From 703720e4d599857d052f0a65f2840224ce36ec5a Mon Sep 17 00:00:00 2001
From: Matthew Dillon
Date: Wed, 8 Feb 2012 15:50:01 -0800
Subject: [PATCH] hammer2 - create new branch, sync working trees from dillon
 & vsrinivas

* hammer2 branch in main repo created.  This branch will loosely track
  master with the additional hammer2 bits added.  Any changes that have
  to be made to non-hammer2 bits will be done in master and merged into
  this branch instead of the other way around.

* Merge dillon's and vsrinivas's work, placing the files in the
  appropriate places in the source tree.

* Initial whitespace cleanup so git doesn't complain

* Major adjustments to hammer2_disk.h taking into account all
  discussions between dillon and vsrinivas to date relative to the
  original design.

  (1) The media blockref was expanded from 32 to 64 bytes in order to
      accommodate up to a 192-bit cryptographic hash.

  (2) The new blockref will also support multiple crc/hash/check
      algorithms.

  (3) The new blockref will also support multiple block compression
      algorithms.

  (4) The new blockref supports a full 64-bit key as well as a key
      mask, and an explicit type field instead of overloading the
      functionality of data_off's 6-bit physical storage size radix.

  (5) The fully associative blockset was reduced from 16 to 8 blockref
      entries.

  (6) Support for multiple copies is possible within a fully
      associative blockset (also needed new fields in the new
      blockref).

  (7) The media volume structure has been fleshed out to support the
      new features.

  (8) The volume structure now also contains fields for configuring
      device paths for up to 8 copies, a fully associative blockset
      pointing to the root inode, and multiple sub-block crcs.
---
 lib/libstand/hammer2.c             |  143 ++
 sbin/mount_hammer2/Makefile        |    7 +
 sbin/mount_hammer2/mount_hammer2.c |   71 +
 sys/vfs/hammer2/Makefile           |    9 +
 sys/vfs/hammer2/hammer2.h          |  689 ++++++++++
 sys/vfs/hammer2/hammer2_disk.h     |  682 ++++++++++
 sys/vfs/hammer2/hammer2_icrc.c     |  147 ++
 sys/vfs/hammer2/hammer2_mount.h    |   56 +
 sys/vfs/hammer2/hammer2_subr.c     | 1542 +++++++++++++++
 sys/vfs/hammer2/hammer2_vfsops.c   | 1008 ++++++++++++
 sys/vfs/hammer2/hammer2_vnops.c    | 2012 ++++++++++++++++++++++++++++
 11 files changed, 6366 insertions(+)
 create mode 100644 lib/libstand/hammer2.c
 create mode 100644 sbin/mount_hammer2/Makefile
 create mode 100644 sbin/mount_hammer2/mount_hammer2.c
 create mode 100644 sys/vfs/hammer2/Makefile
 create mode 100644 sys/vfs/hammer2/hammer2.h
 create mode 100644 sys/vfs/hammer2/hammer2_disk.h
 create mode 100644 sys/vfs/hammer2/hammer2_icrc.c
 create mode 100644 sys/vfs/hammer2/hammer2_mount.h
 create mode 100644 sys/vfs/hammer2/hammer2_subr.c
 create mode 100644 sys/vfs/hammer2/hammer2_vfsops.c
 create mode 100644 sys/vfs/hammer2/hammer2_vnops.c

diff --git a/lib/libstand/hammer2.c b/lib/libstand/hammer2.c
new file mode 100644
index 0000000000..f4c0dea8da
--- /dev/null
+++ b/lib/libstand/hammer2.c
@@ -0,0 +1,143 @@
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+struct hammer2 {
+	int fd;				/* Device fd */
+	struct hammer2_blockref sroot;	/* Superroot blockref */
+};
+
+struct inode {
+	struct hammer2_inode_data dat;	/* raw inode data */
+	off_t doff;			/* disk inode offset */
+};
+
+off_t blockoff(ref)
+	struct hammer2_blockref ref;
+{
+
+}
+
+hinit(hfs)
+	struct hammer2 *hfs;
+{
+	struct hammer2_volume_data volhdr;
+	ssize_t rc;
+	hammer2_crc_t crc0;
+
+	rc = pread(hfs->fd, &volhdr, HAMMER2_VOLUME_SIZE, 0);
+	if (volhdr.magic ==
HAMMER2_VOLUME_ID_HBO) { + printf("Valid HAMMER2 filesystem\n"); + } else { + return (-1); + } + + hfs->sroot = volhdr.sroot_blockref; + return (0); +} + +shread(hfs, ino, buf, off, len) + struct hammer2 *hfs; + struct inode *ino; + char *buf; + off_t off; + size_t len; +{ + /* + * Read [off, off+len) from inode ino rather than from disk + * offsets; correctly decodes blockrefs/indirs/... + */ +} + +struct inode *hlookup1(hfs, ino, name) + struct hammer2 *hfs; + struct inode *ino; + char *name; +{ + static struct inode filino; + off_t off; + int rc; + + bzero(&filino, sizeof(struct inode)); + + for (off = 0; + off < ino->dat.size; + off += sizeof(struct hammer2_inode_data)) + { + rc = shread(hfs, ino, &filino.dat, off, + sizeof(struct hammer2_inode_data)); + if (rc != sizeof(struct hammer2_inode_data)) + continue; + if (strcmp(name, &filino.dat.filename) == 0) + return (&filino); + } + + return (NULL); +} + +struct inode *hlookup(hfs, name) + struct hammer2 *hfs; + char *name; +{ + /* Name is of form /SUPERROOT/a/b/c/file */ + +} + +void hstat(hfs, ino, sb) + struct hammer2 *hfs; + struct inode *ino; + struct stat *sb; +{ + +} + +main(argc, argv) + int argc; + char *argv[]; +{ + struct hammer2 hammer2; + struct inode *ino; + struct stat sb; + int i; + + if (argc < 2) { + fprintf(stderr, "usage: hammer2 \n"); + exit(1); + } + + hammer2.fd = open(argv[1], O_RDONLY); + if (hammer2.fd < 0) { + fprintf(stderr, "unable to open %s\n", argv[1]); + exit(1); + } + + if (hinit(&hammer2)) { + fprintf(stderr, "invalid fs\n"); + close(hammer2.fd); + exit(1); + } + + for (i = 2; i < argc; i++) { + ino = hlookup(&hammer2, argv[i]); + if (ino == NULL) { + fprintf(stderr, "hlookup %s\n", argv[i]); + continue; + } + hstat(&hammer2, ino, &sb); + + printf("%s %lld", argv[i], sb.st_size); + + } +} diff --git a/sbin/mount_hammer2/Makefile b/sbin/mount_hammer2/Makefile new file mode 100644 index 0000000000..f3f4fc835d --- /dev/null +++ b/sbin/mount_hammer2/Makefile @@ -0,0 +1,7 @@ +PROG= mount_hammer2 +SRCS= mount_hammer2.c +MAN= + +CFLAGS+= -I${.CURDIR}/.. + +.include diff --git a/sbin/mount_hammer2/mount_hammer2.c b/sbin/mount_hammer2/mount_hammer2.c new file mode 100644 index 0000000000..a57a49174f --- /dev/null +++ b/sbin/mount_hammer2/mount_hammer2.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * by Venkatesh Srinivas + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include +#include +#include + +#include +#include +#include + +/* + * Usage: mount_hammer2 [volume] [mtpt] + */ +int +main(int argc, char *argv[]) +{ + struct hammer2_mount_info info; + struct vfsconf vfc; + char *mountpt; + int error; + int mount_flags; + + bzero(&info, sizeof(info)); + mount_flags = 0; + + if (argc < 3) + exit(1); + + error = getvfsbyname("hammer2", &vfc); + + info.volume = argv[1]; + info.hflags = 0; + mountpt = argv[2]; + + error = mount(vfc.vfc_name, mountpt, mount_flags, &info); + if (error) { + perror("mount: "); + } +} diff --git a/sys/vfs/hammer2/Makefile b/sys/vfs/hammer2/Makefile new file mode 100644 index 0000000000..451d07693b --- /dev/null +++ b/sys/vfs/hammer2/Makefile @@ -0,0 +1,9 @@ +# Makefile for hammer2 vfs +# +# +.PATH: ${.CURDIR} + +KMOD= hammer2 +SRCS= hammer2_vfsops.c hammer2_vnops.c hammer2_subr.c hammer2_icrc.c + +.include diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h new file mode 100644 index 0000000000..7637a754b0 --- /dev/null +++ b/sys/vfs/hammer2/hammer2.h @@ -0,0 +1,689 @@ +/* + * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * by Venkatesh Srinivas + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This header file contains structures used internally by the HAMMER2 + * implementation. See hammer2_disk.h for on-disk structures. 
+ */ + +#ifndef _VFS_HAMMER2_HAMMER2_H_ +#define _VFS_HAMMER2_HAMMER2_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hammer2_disk.h" +#include "hammer2_mount.h" + +struct hammer2_inode; +struct hammer2_mount; + +struct hammer2_node; + +/* + * A hammer2 inode. + */ +struct hammer2_inode { + struct hammer2_mount *mp; + struct lock lk; + struct vnode *vp; + hammer2_tid_t inum; + unsigned char type; +}; + +#define HAMMER2_INODE_TYPE_DIR 0x01 +#define HAMMER2_INODE_TYPE_FILE 0x02 +#define HAMMER2_INODE_TYPE_ROOT 0x10 +#define HAMMER2_INODE_TYPE_MASK 0x07 + +/* --------------------------------------------------------------------- */ + +/* + * Internal representation of a hammer2 directory entry. + */ +struct hammer2_dirent { + TAILQ_ENTRY(hammer2_dirent) td_entries; + + /* Length of the name stored in this directory entry. This avoids + * the need to recalculate it every time the name is used. */ + uint16_t td_namelen; + + /* The name of the entry, allocated from a string pool. This + * string is not required to be zero-terminated; therefore, the + * td_namelen field must always be used when accessing its value. */ + char * td_name; + + /* Pointer to the node this entry refers to. */ + struct hammer2_node * td_node; +}; + +/* A directory in hammer2 holds a sorted list of directory entries, which in + * turn point to other files (which can be directories themselves). + * + * In hammer2, this list is managed by a tail queue, whose head is defined by + * the struct hammer2_dir type. + * + * It is imporant to notice that directories do not have entries for . and + * .. as other file systems do. These can be generated when requested + * based on information available by other means, such as the pointer to + * the node itself in the former case or the pointer to the parent directory + * in the latter case. This is done to simplify hammer2's code and, more + * importantly, to remove redundancy. */ +TAILQ_HEAD(hammer2_dir, hammer2_dirent); + +/* --------------------------------------------------------------------- */ + + + +/* --------------------------------------------------------------------- */ + +/* + * Internal representation of a hammer2 file system node. + * + * This structure is splitted in two parts: one holds attributes common + * to all file types and the other holds data that is only applicable to + * a particular type. The code must be careful to only access those + * attributes that are actually allowed by the node's type. + * + * + * Below is the key of locks used to protected the fields in the following + * structures. + * + */ +struct hammer2_node { + /* Doubly-linked list entry which links all existing nodes for a + * single file system. This is provided to ease the removal of + * all nodes during the unmount operation. */ + LIST_ENTRY(tmpfs_node) tn_entries; + + /* The node's type. Any of 'VBLK', 'VCHR', 'VDIR', 'VFIFO', + * 'VLNK', 'VREG' and 'VSOCK' is allowed. The usage of vnode + * types instead of a custom enumeration is to make things simpler + * and faster, as we do not need to convert between two types. */ + enum vtype tn_type; + + /* Node identifier. */ + ino_t tn_id; + + /* Node's internal status. This is used by several file system + * operations to do modifications to the node in a delayed + * fashion. 
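+	 *
+	 * A usage sketch (assumed from the usual tmpfs pattern rather
+	 * than anything defined in this patch): a write path marks the
+	 * node and lets tmpfs_itimes() later fold the bits into the
+	 * timestamp fields, e.g.
+	 *
+	 *	node->tn_status |= TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED;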
*/ + int tn_status; +#define TMPFS_NODE_ACCESSED (1 << 1) +#define TMPFS_NODE_MODIFIED (1 << 2) +#define TMPFS_NODE_CHANGED (1 << 3) + + /* The node size. It does not necessarily match the real amount + * of memory consumed by it. */ + off_t tn_size; + + /* Generic node attributes. */ + uid_t tn_uid; + gid_t tn_gid; + mode_t tn_mode; + int tn_flags; + nlink_t tn_links; + int32_t tn_atime; + int32_t tn_atimensec; + int32_t tn_mtime; + int32_t tn_mtimensec; + int32_t tn_ctime; + int32_t tn_ctimensec; + unsigned long tn_gen; + struct lockf tn_advlock; + + /* As there is a single vnode for each active file within the + * system, care has to be taken to avoid allocating more than one + * vnode per file. In order to do this, a bidirectional association + * is kept between vnodes and nodes. + * + * Whenever a vnode is allocated, its v_data field is updated to + * point to the node it references. At the same time, the node's + * tn_vnode field is modified to point to the new vnode representing + * it. Further attempts to allocate a vnode for this same node will + * result in returning a new reference to the value stored in + * tn_vnode. + * + * May be NULL when the node is unused (that is, no vnode has been + * allocated for it or it has been reclaimed). */ + struct vnode * tn_vnode; + + /* interlock to protect tn_vpstate */ + struct lock tn_interlock; + + /* Identify if current node has vnode assiocate with + * or allocating vnode. + */ + int tn_vpstate; + + /* misc data field for different tn_type node */ + union { + /* Valid when tn_type == VBLK || tn_type == VCHR. */ + dev_t tn_rdev; /*int32_t ?*/ + + /* Valid when tn_type == VDIR. */ + struct tn_dir{ + /* Pointer to the parent directory. The root + * directory has a pointer to itself in this field; + * this property identifies the root node. */ + struct tmpfs_node * tn_parent; + + /* Head of a tail-queue that links the contents of + * the directory together. See above for a + * description of its contents. */ + struct tmpfs_dir tn_dirhead; + + /* Number and pointer of the first directory entry + * returned by the readdir operation if it were + * called again to continue reading data from the + * same directory as before. This is used to speed + * up reads of long directories, assuming that no + * more than one read is in progress at a given time. + * Otherwise, these values are discarded and a linear + * scan is performed from the beginning up to the + * point where readdir starts returning values. */ + off_t tn_readdir_lastn; + struct tmpfs_dirent * tn_readdir_lastp; + }tn_dir; + + /* Valid when tn_type == VLNK. */ + /* The link's target, allocated from a string pool. */ + char * tn_link; + + /* Valid when tn_type == VREG. */ + struct tn_reg { + /* The contents of regular files stored in a tmpfs + * file system are represented by a single anonymous + * memory object (aobj, for short). The aobj provides + * direct access to any position within the file, + * because its contents are always mapped in a + * contiguous region of virtual memory. It is a task + * of the memory management subsystem (see uvm(9)) to + * issue the required page ins or page outs whenever + * a position within the file is accessed. 
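+			 *
+			 * Illustration only (hypothetical bookkeeping, not
+			 * part of this patch): growing the file to newsize
+			 * merely adjusts the page count,
+			 *
+			 *	tn_aobj_pages = round_page(newsize) /
+			 *			PAGE_SIZE;
+			 *
+			 * while the paging itself is left to the VM system.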
*/ + vm_object_t tn_aobj; + size_t tn_aobj_pages; + + }tn_reg; + + /* Valid when tn_type = VFIFO */ + struct tn_fifo { + int (*tn_fo_read) (struct file *fp, struct uio *uio, + struct ucred *cred, int flags); + int (*tn_fo_write) (struct file *fp, struct uio *uio, + struct ucred *cred, int flags); + }tn_fifo; + }tn_spec; +}; +LIST_HEAD(tmpfs_node_list, tmpfs_node); + +#define tn_rdev tn_spec.tn_rdev +#define tn_dir tn_spec.tn_dir +#define tn_link tn_spec.tn_link +#define tn_reg tn_spec.tn_reg +#define tn_fifo tn_spec.tn_fifo + +#define TMPFS_NODE_LOCK(node) lockmgr(&(node)->tn_interlock, LK_EXCLUSIVE|LK_RETRY) +#define TMPFS_NODE_UNLOCK(node) lockmgr(&(node)->tn_interlock, LK_RELEASE) +#define TMPFS_NODE_MTX(node) (&(node)->tn_interlock) + +#ifdef INVARIANTS +#define TMPFS_ASSERT_LOCKED(node) do { \ +KKASSERT(node != NULL); \ +KKASSERT(node->tn_vnode != NULL); \ +if (!vn_islocked(node->tn_vnode) && \ +(lockstatus(TMPFS_NODE_MTX(node), curthread) == LK_EXCLUSIVE )) \ +panic("tmpfs: node is not locked: %p", node); \ +} while (0) +#define TMPFS_ASSERT_ELOCKED(node) do { \ +KKASSERT((node) != NULL); \ +KKASSERT(lockstatus(TMPFS_NODE_MTX(node), curthread) == LK_EXCLUSIVE); \ +} while (0) +#else +#define TMPFS_ASSERT_LOCKED(node) (void)0 +#define TMPFS_ASSERT_ELOCKED(node) (void)0 +#endif + +#define TMPFS_VNODE_ALLOCATING 1 +#define TMPFS_VNODE_WANT 2 +#define TMPFS_VNODE_DOOMED 4 +/* --------------------------------------------------------------------- */ + + + +/* + * Governing mount structure for filesystem (aka vp->v_mount) + */ +struct hammer2_mount { + struct mount *hm_mp; + int hm_ronly; /* block device mounted read-only */ + struct vnode *hm_devvp; /* device vnode */ + struct lock hm_lk; + + /* Root inode */ + struct hammer2_inode *hm_iroot; + + /* Per-mount inode zone */ + struct malloc_type *hm_inodes; + int hm_ninodes; + int hm_maxinodes; + + struct malloc_type *hm_ipstacks; + int hm_nipstacks; + int hm_maxipstacks; + + struct hammer2_volume_data hm_sb; + + + /*** TMPFS_MOUNT ***/ + + + /* Maximum number of memory pages available for use by the file + * system, set during mount time. This variable must never be + * used directly as it may be bigger than the current amount of + * free memory; in the extreme case, it will hold the SIZE_MAX + * value. Instead, use the TMPFS_PAGES_MAX macro. */ + vm_pindex_t tm_pages_max; + + /* Number of pages in use by the file system. Cannot be bigger + * than the value returned by TMPFS_PAGES_MAX in any case. */ + vm_pindex_t tm_pages_used; + + /* Pointer to the node representing the root directory of this + * file system. */ + struct tmpfs_node * tm_root; + + /* Maximum number of possible nodes for this file system; set + * during mount time. We need a hard limit on the maximum number + * of nodes to avoid allocating too much of them; their objects + * cannot be released until the file system is unmounted. + * Otherwise, we could easily run out of memory by creating lots + * of empty files and then simply removing them. */ + ino_t tm_nodes_max; + + /* Number of nodes currently that are in use. */ + ino_t tm_nodes_inuse; + + /* maximum representable file size */ + u_int64_t tm_maxfilesize; + + /* Nodes are organized in two different lists. The used list + * contains all nodes that are currently used by the file system; + * i.e., they refer to existing files. The available list contains + * all nodes that are currently available for use by new files. 
+ * Nodes must be kept in this list (instead of deleting them) + * because we need to keep track of their generation number (tn_gen + * field). + * + * Note that nodes are lazily allocated: if the available list is + * empty and we have enough space to create more nodes, they will be + * created and inserted in the used list. Once these are released, + * they will go into the available list, remaining alive until the + * file system is unmounted. */ + struct tmpfs_node_list tm_nodes_used; + + /* All node lock to protect the node list and tmp_pages_used */ + struct lock allnode_lock; + + /* Per-mount malloc zones for tmpfs nodes, names, and dirents */ + struct malloc_type *tm_node_zone; + struct malloc_type *tm_dirent_zone; + struct malloc_type *tm_name_zone; + + struct objcache_malloc_args tm_node_zone_malloc_args; + struct objcache_malloc_args tm_dirent_zone_malloc_args; + + /* Pools used to store file system meta data. These are not shared + * across several instances of tmpfs for the reasons described in + * tmpfs_pool.c. */ + struct objcache *tm_dirent_pool; + struct objcache *tm_node_pool; + + int tm_flags; +}; + +#if defined(_KERNEL) + +MALLOC_DECLARE(M_HAMMER2); + + +static inline struct mount * +H2TOMP(struct hammer2_mount *hmp) +{ + return (struct mount *) hmp->hm_mp; +} + +#define VTOI(vp) ((struct hammer2_inode *) (vp)->v_data) +#define ITOV(ip) ((ip)->vp) + +extern struct vop_ops hammer2_vnode_vops; +extern struct vop_ops hammer2_spec_vops; +extern struct vop_ops hammer2_fifo_vops; + +/* hammer2_inode.c */ + +extern int hammer2_inactive(struct vop_inactive_args *); +extern int hammer2_reclaim(struct vop_reclaim_args *); + +/* hammer2_subr.c */ + +extern struct vnode *igetv(struct hammer2_inode *, int *); + +extern void hammer2_mount_exlock(struct hammer2_mount *); +extern void hammer2_mount_shlock(struct hammer2_mount *); +extern void hammer2_mount_unlock(struct hammer2_mount *); + + + +#endif /* kernel */ +#endif + +#ifndef _VFS_TMPFS_TMPFS_H_ +#define _VFS_TMPFS_TMPFS_H_ + +/* --------------------------------------------------------------------- + * KERNEL-SPECIFIC DEFINITIONS + * --------------------------------------------------------------------- */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* --------------------------------------------------------------------- */ +#include +#include +#include +#include + +MALLOC_DECLARE(M_TMPFSMNT); + +/* Each entry in a directory has a cookie that identifies it. Cookies + * supersede offsets within directories because, given how tmpfs stores + * directories in memory, there is no such thing as an offset. (Emulating + * a real offset could be very difficult.) + * + * The '.', '..' and the end of directory markers have fixed cookies which + * cannot collide with the cookies generated by other entries. The cookies + * for the other entries are generated based on the memory address on which + * stores their information is stored. + * + * Ideally, using the entry's memory pointer as the cookie would be enough + * to represent it and it wouldn't cause collisions in any system. + * Unfortunately, this results in "offsets" with very large values which + * later raise problems in the Linux compatibility layer (and maybe in other + * places) as described in PR kern/32034. Hence we need to workaround this + * with a rather ugly hack. + * + * Linux 32-bit binaries, unless built with _FILE_OFFSET_BITS=64, have off_t + * set to 'long', which is a 32-bit *signed* long integer. 
Regardless of + * the macro value, GLIBC (2.3 at least) always uses the getdents64 + * system call (when calling readdir) which internally returns off64_t + * offsets. In order to make 32-bit binaries work, *GLIBC* converts the + * 64-bit values returned by the kernel to 32-bit ones and aborts with + * EOVERFLOW if the conversion results in values that won't fit in 32-bit + * integers (which it assumes is because the directory is extremely large). + * This wouldn't cause problems if we were dealing with unsigned integers, + * but as we have signed integers, this check fails due to sign expansion. + * + * For example, consider that the kernel returns the 0xc1234567 cookie to + * userspace in a off64_t integer. Later on, GLIBC casts this value to + * off_t (remember, signed) with code similar to: + * system call returns the offset in kernel_value; + * off_t casted_value = kernel_value; + * if (sizeof(off_t) != sizeof(off64_t) && + * kernel_value != casted_value) + * error! + * In this case, casted_value still has 0xc1234567, but when it is compared + * for equality against kernel_value, it is promoted to a 64-bit integer and + * becomes 0xffffffffc1234567, which is different than 0x00000000c1234567. + * Then, GLIBC assumes this is because the directory is very large. + * + * Given that all the above happens in user-space, we have no control over + * it; therefore we must workaround the issue here. We do this by + * truncating the pointer value to a 32-bit integer and hope that there + * won't be collisions. In fact, this will not cause any problems in + * 32-bit platforms but some might arise in 64-bit machines (I'm not sure + * if they can happen at all in practice). + * + * XXX A nicer solution shall be attempted. */ +#ifdef _KERNEL +#define TMPFS_DIRCOOKIE_DOT 0 +#define TMPFS_DIRCOOKIE_DOTDOT 1 +#define TMPFS_DIRCOOKIE_EOF 2 +static __inline +off_t +tmpfs_dircookie(struct tmpfs_dirent *de) +{ + off_t cookie; + + cookie = ((off_t)(uintptr_t)de >> 1) & 0x7FFFFFFF; + KKASSERT(cookie != TMPFS_DIRCOOKIE_DOT); + KKASSERT(cookie != TMPFS_DIRCOOKIE_DOTDOT); + KKASSERT(cookie != TMPFS_DIRCOOKIE_EOF); + + return cookie; +} +#endif + + +/* + * Internal representation of a tmpfs mount point. + */ + +#define TMPFS_LOCK(tm) lockmgr(&(tm)->allnode_lock, LK_EXCLUSIVE|LK_RETRY) +#define TMPFS_UNLOCK(tm) lockmgr(&(tm)->allnode_lock, LK_RELEASE) + +/* --------------------------------------------------------------------- */ + +/* + * This structure maps a file identifier to a tmpfs node. Used by the + * NFS code. + */ +struct tmpfs_fid { + uint16_t tf_len; + uint16_t tf_pad; + ino_t tf_id; + unsigned long tf_gen; +}; + +/* --------------------------------------------------------------------- */ + +#ifdef _KERNEL +/* + * Prototypes for tmpfs_subr.c. 
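+ *
+ * A rough sketch of how these compose during file creation (assuming
+ * the usual tmpfs flow; argument lists abbreviated):
+ *
+ *	tmpfs_alloc_file()
+ *	    -> tmpfs_alloc_node()	allocate the backing node
+ *	    -> tmpfs_alloc_dirent()	wrap it in a directory entry
+ *	    -> tmpfs_dir_attach()	link the entry into the directory
+ *	    -> tmpfs_alloc_vp()		hand back a referenced vnode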
+ */ + +int tmpfs_alloc_node(struct hammer2_mount *, enum vtype, + uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *, + char *, int, int, struct tmpfs_node **); +void tmpfs_free_node(struct hammer2_mount *, struct tmpfs_node *); +int tmpfs_alloc_dirent(struct hammer2_mount *, struct tmpfs_node *, + const char *, uint16_t, struct tmpfs_dirent **); +void tmpfs_free_dirent(struct hammer2_mount *, struct tmpfs_dirent *); +int tmpfs_alloc_vp(struct mount *, struct tmpfs_node *, int, + struct vnode **); +void tmpfs_free_vp(struct vnode *); +int tmpfs_alloc_file(struct vnode *, struct vnode **, struct vattr *, + struct namecache *, struct ucred *, char *); +void tmpfs_dir_attach(struct tmpfs_node *, struct tmpfs_dirent *); +void tmpfs_dir_detach(struct tmpfs_node *, struct tmpfs_dirent *); +struct tmpfs_dirent * tmpfs_dir_lookup(struct tmpfs_node *node, + struct tmpfs_node *f, + struct namecache *ncp); +int tmpfs_dir_getdotdent(struct tmpfs_node *, struct uio *); +int tmpfs_dir_getdotdotdent(struct hammer2_mount *, + struct tmpfs_node *, struct uio *); +struct tmpfs_dirent * tmpfs_dir_lookupbycookie(struct tmpfs_node *, off_t); +int tmpfs_dir_getdents(struct tmpfs_node *, struct uio *, off_t *); +int tmpfs_reg_resize(struct vnode *, off_t, int); +int tmpfs_chflags(struct vnode *, int, struct ucred *); +int tmpfs_chmod(struct vnode *, mode_t, struct ucred *); +int tmpfs_chown(struct vnode *, uid_t, gid_t, struct ucred *); +int tmpfs_chsize(struct vnode *, u_quad_t, struct ucred *); +int tmpfs_chtimes(struct vnode *, struct timespec *, struct timespec *, + int, struct ucred *); +void tmpfs_itimes(struct vnode *, const struct timespec *, + const struct timespec *); + +void tmpfs_update(struct vnode *); +int tmpfs_truncate(struct vnode *, off_t); +int tmpfs_node_ctor(void *obj, void *privdata, int flags); + +/* --------------------------------------------------------------------- */ + +/* + * Convenience macros to simplify some logical expressions. + */ +#define IMPLIES(a, b) (!(a) || (b)) +#define IFF(a, b) (IMPLIES(a, b) && IMPLIES(b, a)) + +/* --------------------------------------------------------------------- */ + +/* + * Checks that the directory entry pointed by 'de' matches the name 'name' + * with a length of 'len'. + */ +#define TMPFS_DIRENT_MATCHES(de, name, len) \ +(de->td_namelen == (uint16_t)len && \ +bcmp((de)->td_name, (name), (de)->td_namelen) == 0) + +/* --------------------------------------------------------------------- */ + +/* + * Ensures that the node pointed by 'node' is a directory and that its + * contents are consistent with respect to directories. + */ +#define TMPFS_VALIDATE_DIR(node) \ +KKASSERT((node)->tn_type == VDIR); \ +KKASSERT((node)->tn_size % sizeof(struct tmpfs_dirent) == 0); \ +KKASSERT((node)->tn_dir.tn_readdir_lastp == NULL || \ +tmpfs_dircookie((node)->tn_dir.tn_readdir_lastp) == (node)->tn_dir.tn_readdir_lastn); + +#endif + +/* --------------------------------------------------------------------- */ + +/* + * Macros/functions to convert from generic data structures to tmpfs + * specific ones. 
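+ *
+ * e.g. a vnode operation would typically recover its state as follows
+ * (usage sketch; vp is the vnode argument):
+ *
+ *	struct tmpfs_node *node = VP_TO_TMPFS_NODE(vp);
+ *	struct hammer2_mount *tmp = VFS_TO_TMPFS(vp->v_mount);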
+ */ + +static inline +struct hammer2_mount * +VFS_TO_TMPFS(struct mount *mp) +{ + struct hammer2_mount *tmp; + + KKASSERT((mp) != NULL && (mp)->mnt_data != NULL); + tmp = (struct hammer2_mount *)(mp)->mnt_data; + return tmp; +} + +static inline +struct tmpfs_node * +VP_TO_TMPFS_NODE(struct vnode *vp) +{ + struct tmpfs_node *node; + + KKASSERT((vp) != NULL && (vp)->v_data != NULL); + node = (struct tmpfs_node *)vp->v_data; + return node; +} + +static inline +struct tmpfs_node * +VP_TO_TMPFS_DIR(struct vnode *vp) +{ + struct tmpfs_node *node; + + node = VP_TO_TMPFS_NODE(vp); + TMPFS_VALIDATE_DIR(node); + return node; +} + +/* --------------------------------------------------------------------- */ +/* + * buffer cache size + */ +#define BSIZE (off_t)16384 /* buffer cache size*/ +#define BMASK (off_t)(BSIZE - 1) + +extern struct vop_ops tmpfs_vnode_vops; +extern struct vop_ops tmpfs_fifo_vops; + +/* + * Declarations for tmpfs_vnops.c. + */ + +int tmpfs_access(struct vop_access_args *); +int tmpfs_getattr(struct vop_getattr_args *); +int tmpfs_setattr(struct vop_setattr_args *); +int tmpfs_reclaim(struct vop_reclaim_args *); + + +#endif /* _VFS_TMPFS_TMPFS_H_ */ + +#ifndef _MORE_HAMMER2_ +#define _MORE_HAMMER2_ +static inline struct hammer2_mount * +MPTOH2(struct mount *mp) +{ + return (struct hammer2_mount *) mp->mnt_data; +} +#endif diff --git a/sys/vfs/hammer2/hammer2_disk.h b/sys/vfs/hammer2/hammer2_disk.h new file mode 100644 index 0000000000..c9ee06f29c --- /dev/null +++ b/sys/vfs/hammer2/hammer2_disk.h @@ -0,0 +1,682 @@ +/* + * Copyright (c) 2011-2012 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * by Venkatesh Srinivas + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#ifndef VFS_HAMMER2_DISK_H_ +#define VFS_HAMMER2_DISK_H_ + +/* + * The structures below represent the on-disk media structures for the HAMMER2 + * filesystem. Note that all fields for on-disk structures are naturally + * aligned. 
The host endian format is typically used - compatibility is + * possible if the implementation detects reversed endian and adjusts accesses + * accordingly. + * + * HAMMER2 primarily revolves around the directory topology: inodes, + * directory entries, and block tables. Block device buffer cache buffers + * are always 64KB. Logical file buffers are typically 16KB. All data + * references utilize 64-bit byte offsets. + * + * Free block management is handled independently using blocks reserved by + * the media topology. + */ + +/* + * The data at the end of a file or directory may be a fragment in order + * to optimize storage efficiency. The minimum fragment size is 64 bytes. + * Since allocations are in powers of 2 fragments must also be sized in + * powers of 2 (64, 128, 256, ... 65536). + * + * For the moment the maximum allocation size is HAMMER2_PBUFSIZE (64K), + * which is 2^16. Larger extents may be supported in the future. + * + * A full indirect block uses supports 1024 x 64-byte blockrefs. + * + * A maximally sized file (2^64-1 bytes) requires 5 indirect block levels. + * The hammer2_blockset in the volume header or file inode has another 8 + * entries, giving us 66+3 = 69 bits of address space. However, some bits + * are taken up by (potentially) requests for redundant copies. HAMMER2 + * currently supports up to 8 copies, which brings the address space down + * to 66 bits and gives us 2 bits of leeway. + */ +#define HAMMER2_MIN_ALLOC 64 /* minimum allocation size */ +#define HAMMER2_MIN_RADIX 6 /* minimum allocation size 2^N */ +#define HAMMER2_MAX_RADIX 16 /* maximum allocation size 2^N */ +#define HAMMER2_IND1_RADIX 26 /* lowest full indirect block radix */ +#define HAMMER2_IND2_RADIX 36 +#define HAMMER2_IND3_RADIX 46 +#define HAMMER2_IND4_RADIX 56 +#define HAMMER2_IND5_RADIX 66 /* highest full indirect block radix */ + +/* + * HAMMER2 utilizes 64K physical buffers and 16K logical filesystem buffers. + * The smaller logical filesystem buffers reduce ram waste when the OS is + * caching lots of small files. + */ +#define HAMMER2_PBUFRADIX 16 /* physical buf (1<<16) bytes */ +#define HAMMER2_PBUFSIZE 65536 /* fixed physical device buffer size */ +#define HAMMER2_LBUFSIZE 16384 /* vnode/logical file buffer size */ + +/* + * HAMMER2 processes blockrefs in sets of 8. The set is fully associative, + * is not sorted, and may contain holes. + * + * A full indirect block supports 1024 blockrefs. + * + * An inode embeds one set of blockrefs but may also use the data area for + * up to 512 bytes of direct data. + */ +#define HAMMER2_SET_COUNT 8 /* direct entries & associativity */ +#define HAMMER2_SET_RADIX 3 +#define HAMMER2_IND_COUNT 1024 /* 1 << HAMMER2_IND_RADIX */ +#define HAMMER2_IND_RADIX 10 +#define HAMMER2_EMBEDDED_BYTES 512 +#define HAMMER2_EMBEDDED_RADIX 9 + +#define HAMMER2_PBUFMASK (HAMMER2_PBUFSIZE - 1) +#define HAMMER2_LBUFMASK (HAMMER2_LBUFSIZE - 1) + +#define HAMMER2_PBUFSIZE64 ((hammer2_off_t)HAMMER2_PBUFSIZE) +#define HAMMER2_PBUFMASK64 ((hammer2_off_t)HAMMER2_PBUFMASK) +#define HAMMER2_LBUFMASK64 ((hammer2_off_t)HAMMER2_LBUFMASK) + +#define HAMMER2_UUID_STRING "5cbb9ad1-862d-11dc-a94d-01301bb8a9f5" + +/* + * A HAMMER2 filesystem is always sized in multiples of 8MB. + * + * A 2MB segment is reserved at the beginning of each 2GB zone. This segment + * contains the volume header and the free block table. 
+ */ +#define HAMMER2_VOLUME_ALIGN (8 * 1024 * 1024) +#define HAMMER2_VOLUME_ALIGN64 ((hammer2_off_t)HAMMER2_VOLUME_ALIGN) +#define HAMMER2_VOLUME_ALIGNMASK (HAMMER2_VOLUME_ALIGN - 1) +#define HAMMER2_VOLUME_ALIGNMASK64 ((hammer2_off_t)HAMMER2_VOLUME_ALIGNMASK) + +#define HAMMER2_NEWFS_ALIGN (HAMMER2_VOLUME_ALIGN) +#define HAMMER2_NEWFS_ALIGN64 ((hammer2_off_t)HAMMER2_VOLUME_ALIGN) +#define HAMMER2_NEWFS_ALIGNMASK (HAMMER2_VOLUME_ALIGN - 1) +#define HAMMER2_NEWFS_ALIGNMASK64 ((hammer2_off_t)HAMMER2_NEWFS_ALIGNMASK) + +#define HAMMER2_RESERVE_BYTES64 (2LLU * 1024 * 1024 * 1024) +#define HAMMER2_RESERVE_MASK64 (HAMMER2_RESERVE_BYTES64 - 1) +#define HAMMER2_RESERVE_SEG (2 * 1024 * 1024) +#define HAMMER2_RESERVE_SEG64 ((hammer2_off_t)HAMMER2_RESERVE_SEG) +#define HAMMER2_RESERVE_SEG_ENTRIES (HAMMER2_RESERVE_SEG/HAMMER2_BUFSIZE) + +/* + * Two linear areas can be reserved after the initial 2MB segment in the base + * zone (the one starting at offset 0). These areas are NOT managed by the + * block allocator and do not fall under HAMMER2 crc checking rules based + * at the volume header (but can be self-CRCd internally, depending). + */ +#define HAMMER2_BOOT_MIN_BYTES HAMMER2_VOLUME_ALIGN +#define HAMMER2_BOOT_NOM_BYTES (64*1024*1024) +#define HAMMER2_BOOT_MAX_BYTES (256*1024*1024) + +#define HAMMER2_REDO_MIN_BYTES HAMMER2_VOLUME_ALIGN +#define HAMMER2_REDO_NOM_BYTES (256*1024*1024) +#define HAMMER2_REDO_MAX_BYTES (1024*1024*1024) + +/* + * Most HAMMER2 types are implemented as unsigned 64-bit integers. + * Transaction ids are monotonic. + * + * We utilize 32-bit iSCSI CRCs. + */ +typedef uint64_t hammer2_tid_t; +typedef uint64_t hammer2_off_t; +typedef uint64_t hammer2_key_t; +typedef uint32_t hammer2_crc32_t; + +/* + * Miscellanious ranges (all are unsigned). + */ +#define HAMMER2_MIN_TID 1ULL +#define HAMMER2_MAX_TID 0xFFFFFFFFFFFFFFFFULL +#define HAMMER2_MIN_KEY 0ULL +#define HAMMER2_MAX_KEY 0xFFFFFFFFFFFFFFFFULL +#define HAMMER2_MIN_OFFSET 0ULL +#define HAMMER2_MAX_OFFSET 0xFFFFFFFFFFFFFFFFULL + +/* + * HAMMER2 data offset special cases and masking. + * + * All HAMMER2 data offsets have to be broken down into a 64K buffer base + * offset (HAMMER2_OFF_MASK_HI) and a 64K buffer index (HAMMER2_OFF_MASK_LO). + * + * Indexes into physical buffers are always 64-byte aligned. The low 6 bits + * of the data offset field specifies how large the data chunk being pointed + * to as a power of 2. This value typically ranges from HAMMER2_MIN_RADIX + * to HAMMER2_MAX_RADIX (6-16). Larger values may be supported in the future + * to support file extents. + */ +#define HAMMER2_OFF_BAD ((hammer2_off_t)-1) +#define HAMMER2_OFF_MASK 0xFFFFFFFFFFFFFFC0ULL +#define HAMMER2_OFF_MASK_LO (HAMMER2_OFF_MASK & HAMMER2_PBUFMASK64) +#define HAMMER2_OFF_MASK_HI (~HAMMER2_PBUFMASK64) +#define HAMMER2_OFF_MASK_RADIX 0x000000000000003FULL +#define HAMMER2_MAX_COPIES 6 + +/* + * The media block reference structure. This forms the core of the HAMMER2 + * media topology recursion. This 64-byte data structure is embedded in the + * volume header, in inodes (which are also directory entries), and in + * indirect blocks. + * + * A blockref references a single media item, which typically can be a + * directory entry (aka inode), indirect block, or data block. + * + * The primary feature a blockref represents is the ability to validate + * the entire tree underneath it via its check code. Any modification to + * anything propagates up the blockref tree all the way to the root, replacing + * the related blocks. 
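+ * (In effect the per-blockref check codes form a Merkle tree over the
+ * entire filesystem topology.)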
Propagations can shortcut to the volume root to + * implement the 'fast syncing' feature but this only delays the eventual + * propagation. + * + * The check code can be a simple 32-bit iscsi code, a 64-bit crc, + * or as complex as a 192 bit cryptographic hash. 192 bits is the maximum + * supported check code size, which is not sufficient for unverified dedup + * UNLESS one doesn't mind once-in-a-blue-moon data corruption (such as when + * farming web data). HAMMER2 has an unverified dedup feature for just this + * purpose. + */ +struct hammer2_blockref { /* MUST BE EXACTLY 64 BYTES */ + uint8_t type; /* type of underlying item */ + uint8_t methods; /* check method & compression method */ + uint8_t copyid; /* specify which copy this is */ + uint8_t keybits; /* key mask bits for recursion */ + uint8_t vradix; + uint8_t reserved05; + uint8_t reserved06; + uint8_t reserved07; + hammer2_key_t key; /* key specification */ + hammer2_tid_t mirror_tid; /* propagate for mirror scan */ + hammer2_tid_t modify_tid; /* modifications sans propagation */ + hammer2_off_t data_off; /* low 6 bits is phys size (radix)*/ + union { /* check info */ + char buf[24]; + struct { + uint32_t value; + uint32_t unused[5]; + } iscsi32; + struct { + uint64_t value; + uint64_t unused[2]; + } crc64; + struct { + char data[24]; + } sha192; + } check; +}; + +typedef struct hammer2_blockref hammer2_blockref_t; + +#define HAMMER2_BLOCKREF_BYTES 64 /* blockref struct in bytes */ +#define HAMMER2_ENC_COMPMETHOD(n) (n) +#define HAMMER2_ENC_CHECKMETHOD(n) ((n) << 4) +#define HAMMER2_DEC_COMPMETHOD(n) ((n) & 15) +#define HAMMER2_DEC_CHECKMETHOD(n) (((n) >> 4) & 15) + +/* + * HAMMER2 block references are collected into sets of 8 blockrefs. These + * sets are fully associative, meaning the elements making up a set are + * not sorted in any way and may contain duplicate entries, holes, or + * entries which shortcut multiple levels of indirection. Sets are used + * in various ways: + * + * (1) When redundancy is desired a set may contain several duplicate + * entries pointing to different copies of the same data. Up to 8 copies + * are supported but the set structure becomes a bit inefficient once + * you go over 4. + * + * (2) The blockrefs in a set can shortcut multiple levels of indirections + * within the bounds imposed by the parent of set. + * + * When a set fills up another level of indirection is inserted, moving + * some or all of the set's contents into indirect blocks placed under the + * set. This is a top-down approach in that indirect blocks are not created + * until the set actually becomes full (that is, the entries in the set can + * shortcut the indirect blocks when the set is not full). Depending on how + * things are filled multiple indirect blocks will eventually be created. + */ +struct hammer2_blockset { + hammer2_blockref_t refs[HAMMER2_SET_COUNT]; +}; + +/* + * Catch programmer snafus + */ +#if (1 << HAMMER2_IND_RADIX) != HAMMER2_IND_COUNT +#error "hammer2 indirect radix is incorrect" +#endif +#if (HAMMER2_IND_COUNT * 64) != HAMMER2_BUFSIZE +#error "hammer2 indirect entries is incorrect" +#endif +#if (1 << HAMMER2_SET_RADIX) != HAMMER2_SET_COUNT +#error "hammer2 direct radix is incorrect" +#endif +#if (1 << HAMMER2_PBUFRADIX) != HAMMER2_PBUFSIZE +#error "HAMMER2_PBUFRADIX and HAMMER2_PBUFSIZE are inconsistent" +#endif +#if (1 << HAMMER2_MIN_RADIX) != HAMMER2_MIN_ALLOC +#error "HAMMER2_MIN_RADIX and HAMMER2_MIN_ALLOC are inconsistent" +#endif + +/* + * The media indirect block structure. 
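+ *
+ * Note that 1024 blockrefs x 64 bytes fills exactly one 64KB physical
+ * buffer (HAMMER2_PBUFSIZE), so a full indirect block always occupies
+ * a whole device buffer.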
+ */ +struct hammer2_indblock { + hammer2_blockref_t blocks[HAMMER2_IND_COUNT]; +}; + +typedef struct hammer2_indblock hammer2_indblock_t; + +/* + * In HAMMER2 inodes ARE directory entries, with a special exception for + * hardlinks. The inode number is stored in the inode rather than being + * based on the location of the inode (since the location moves every time + * the inode or anything underneath the inode is modified). + * + * The inode is 1024 bytes, made up of 256 bytes of meta-data, 256 bytes + * for the filename, and 512 bytes worth of direct file data OR an embedded + * blockset. + * + * Directories represent one inode per blockref. Inodes are not laid out + * as a file but instead are represented by the related blockrefs. The + * blockrefs, in turn, are indexed by the 64-bit directory hash key. Remember + * that blocksets are fully associative, so a certain degree efficiency is + * achieved just from that. + * + * Up to 512 bytes of direct data can be embedded in an inode, and since + * inodes are essentially directory entries this also means that small data + * files end up simply being laid out linearly in the directory, resulting + * in fewer seeks and highly optimal access. + * + * The compression mode can be changed at any time in the inode and is + * recorded on a blockref-by-blockref basis. + * + * Hardlinks are supported via the inode map. Essentially the way a hardlink + * works is that all individual directory entries representing the same file + * are special cased and specify the same inode number. The actual file + * is placed in the nearest parent directory that is parent to all instances + * of the hardlink. If all hardlinks to a file are in the same directory + * the actual file will also be placed in that directory. This file uses + * the inode number as the directory entry key and is invisible to normal + * directory scans. Real directory entry keys are differentiated from the + * inode number key via bit 63. Access to the hardlink silently looks up + * the real file and forwards all operations to that file. Removal of the + * last hardlink also removes the real file. 
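+ *
+ * The layout arithmetic: 256 bytes of meta-data (0000-00FF), 256
+ * bytes of filename (0100-01FF), and 512 bytes of direct data or an
+ * embedded blockset (0200-03FF) add up to the 1024 bytes asserted
+ * below, matching the field offsets annotated in hammer2_inode_data.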
+ */ +#define HAMMER2_INODE_BYTES 1024 /* (asserted by code) */ +#define HAMMER2_INODE_MAXNAME 256 /* maximum name in bytes */ +#define HAMMER2_INODE_VERSION_ONE 1 + +struct hammer2_inode_data { + uint16_t version; /* 0000 inode data version */ + uint16_t reserved02; /* 0002 */ + uint32_t uflags; /* 0004 chflags */ + uint32_t rmajor; /* 0008 available for device nodes */ + uint32_t rminor; /* 000C available for device nodes */ + uint64_t ctime; /* 0010 inode change time */ + uint64_t mtime; /* 0018 modified time */ + uint64_t atime; /* 0020 access time (unsupported) */ + uint64_t btime; /* 0028 birth time */ + uuid_t uid; /* 0030 uid / degenerate unix uid */ + uuid_t gid; /* 0040 gid / degenerate unix gid */ + + uint8_t type; /* 0050 object type */ + uint8_t op_flags; /* 0051 operational flags */ + uint16_t cap_flags; /* 0052 capability flags */ + uint32_t mode; /* 0054 unix modes (typ low 16 bits) */ + + hammer2_tid_t inum; /* 0058 inode number */ + hammer2_off_t size; /* 0060 size of file */ + uint64_t nlinks; /* 0068 hard links (typ only dirs) */ + hammer2_tid_t iparent; /* 0070 parent inum (recovery only) */ + uint64_t reserved78; /* 0078 */ + + hammer2_off_t data_quota; /* 0080 subtree quota in bytes */ + hammer2_off_t data_count; /* 0088 subtree byte count */ + hammer2_off_t inode_quota; /* 0090 subtree quota inode count */ + hammer2_off_t inode_count; /* 0098 subtree inode count */ + uint16_t name_len; /* 00A0 filename length */ + uint8_t comp_algo; /* 00A2 compression request & algo */ + uint8_t reservedA3; /* 00A3 */ + uint32_t reservedA4; /* 00A4 */ + hammer2_key_t name_key; /* 00A8 full filename key */ + uint8_t copyids[8]; /* 00B0 request copies to (up to 8) */ + uint64_t reservedB8; /* 00B8 */ + uint64_t reservedC0; /* 00C0 */ + uint64_t reservedC8; /* 00C8 */ + uint64_t reservedD0; /* 00D0 */ + uint64_t reservedD8; /* 00D8 */ + uint64_t reservedE0; /* 00E0 */ + uint64_t reservedE8; /* 00E8 */ + uint64_t reservedF0; /* 00F0 */ + uint64_t reservedF8; /* 00F8 */ + + char filename[HAMMER_INODE_MAXNAME]; + /* 0100-01FF (256 char, unterminated) */ + union { /* 0200-03FF (64x8 = 512 bytes) */ + struct hammer2_blockset blockset; + char data[HAMMER2_EMBEDDED_BYTES]; + } u; +}; + +#define HAMMER2_OPFLAG_DIRECTDATA 0x01 + +#define HAMMER2_OBJTYPE_UNKNOWN 0 +#define HAMMER2_OBJTYPE_DIRECTORY 1 +#define HAMMER2_OBJTYPE_REGFILE 2 +#define HAMMER2_OBJTYPE_FIFO 4 +#define HAMMER2_OBJTYPE_CDEV 5 +#define HAMMER2_OBJTYPE_BDEV 6 +#define HAMMER2_OBJTYPE_SOFTLINK 7 +#define HAMMER2_OBJTYPE_HARDLINK 8 +#define HAMMER2_OBJTYPE_SOCKET 9 +#define HAMMER2_OBJTYPE_WHITEOUT 10 + +#if 0 +/* + * HAMMER2 special blocks, 128 64K buffers at the beginning of each 2GB segment. + */ +#define HAMMER2_SPECBLOCK(n) (HAMMER2_PBUFSIZE64 * (n)) + +#define HAMMER2_SBLOCK_VOLHDR (0) +#define HAMMER2_SBLOCK_FREEMAP_ROOT(side) (1 + (8 * (side))) +#define HAMMER2_SBLOCK_FREEMAP_L1(side) (2 + (8 * (side))) +#define HAMMER2_SBLOCK_FREEMAP_L2(side) (3 + (8 * (side))) +#define HAMMER2_SBLOCK_FREEMAP_LEAF(side, n) (4 + (8 * (side)) + (n)) + +/* + * The allocref structure represents the allocation table. One 64K block + * is broken down into 4096 x 16 byte entries. Each indirect block chops + * 11 bits off the 64-bit storage space, with leaf entries representing + * 64KB blocks. So: (12, 12, 12, 12, 16) = 64 bit storage space. + * + * Each 64K allocmap block breaks the 4096 entries into a 64x64 tree with + * big_hint1 representing the top level every 64th entry and big_hint2 + * representing the lower level in each entry. 
These fields specify the + * largest contiguous radix (1-63) available for allocation in the related + * sub-tree. The largest contiguous radix available for the entire block + * is saved in the parent (for the root this will be alloc_blockref in the + * volume header). The hints may be larger than actual and will be corrected + * on the fly but must not be smaller. The allocator uses the hints to + * very quickly locate nearby blocks of the desired size. + * + * In indirect blocks the 64-bit free[_or_mask] field stores the total free + * space for each of the 4096 sub-nodes in bytes. The total free space + * represented by the indirect block is stored in its parent. + * + * Each leaf element represents a 64K block. A bitmap replaces the free space + * count, giving us a 1KB allocation resolution. A micro-allocation append + * offset replaces the icrc field. The micro-allocation feature is not + * currently implemented and the field will be set to 65536. + * + * The allocation map uses reserved blocks so no data block reference is + * required, only a bit in the flags field to specify which of two possible + * reserved blocks to use. This allows the allocation map to be flushed to + * disk with minimal synchronization. + */ +struct hammer2_allocref { + uint32_t icrc_or_app; /* node: icrc, leaf: append offset */ + uint16_t flags; + uint8_t big_hint1; /* upper level hint */ + uint8_t big_hint2; /* lower level hint */ + uint64_t free_or_mask; /* node: free bytes, leaf: bitmask */ +}; + +typedef struct hammer2_allocref hammer2_allocref_t; + +/* + * WARNING - allocref size x entries must equate to the hammer buffer size, + * and 12 bits per recursion is assumed by the allocator. + * + * ALTA-D Since no data_offset is specified flags are needed to select + * which sub-block to recurse down into for root & internal nodes. + * (only ALTA and ALTB is currently supported). + * + * LEAF Terminal entry, always set for leafs. May be used to support + * 4MB extent allocations and early termination in the future. + * (not required to shortcut allocation scans as the big_hint1/2 + * fields are used for this). + */ +#define HAMMER2_ALLOCREF_BYTES 16 /* structure size */ +#define HAMMER2_ALLOCREF_ENTRIES 4096 /* entries */ +#define HAMMER2_ALLOCREF_RADIX 12 /* log2(entries) */ + +#if (HAMMER2_ALLOCREF_BYTES * HAMMER2_ALLOCREF_ENTRIES) != HAMMER2_BUFSIZE +#error "allocref parameters do not fit in hammer buffer" +#endif +#if (1 << HAMMER2_ALLOCREF_RADIX) != HAMMER2_ALLOCREF_ENTRIES +#error "allocref parameters are inconsistent" +#endif + +#define HAMMER2_ALLOCREF_ALTMASK 0x0003 /* select block for recurse */ +#define HAMMER2_ALLOCREF_ALTA 0x0000 +#define HAMMER2_ALLOCREF_ALTB 0x0001 +#define HAMMER2_ALLOCREF_ALTC 0x0002 /* unsupported */ +#define HAMMER2_ALLOCREF_ALTD 0x0003 /* unsupported */ +#define HAMMER2_ALLOCREF_LEAF 0x0004 + +#endif + +/* + * Copies information stored in the volume header. Typically formatted + * e.g. like 'serno/A21343249.s1d' + * + * There are 8 copy_data[]'s in the volume header but up to 256 copyid's. + * When a copy is removed its copyid remains reserved in the copyid bitmap + * (copyexists[] bitmap in volume_data) until the copy references have + * been removed from the entire filesystem and cannot be reused until the + * removal is complete. However, new copy entries with other ids can be + * instantly added, replacing the original copy_data[]... which is fine as + * long as the copyid does not conflict. + * + * This structure must be exactly 64 bytes long. 
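+ *
+ * The arithmetic: copyid + flags + two reserved bytes + the 60-byte
+ * path array = 64 bytes.  A compile-time assertion on
+ * sizeof(struct hammer2_copy_data) would catch any layout drift.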
+ */ +struct hammer2_copy_data { + uint8_t copyid; /* 0-255 */ + uint8_t flags; + uint8_t reserved02; + uint8_t reserved03; + uint8_t path[60]; /* up to 59-char string, nul-terminated */ +}; + +typedef struct hammer2_copy_data hammer2_copy_data_t; + +#define COPYDATAF_OUTOFSYNC 0x0001 + +/* + * The volume header eats a 64K block. There is currently an issue where + * we want to try to fit all nominal filesystem updates in a 512-byte section + * but it may be a lost cause due to the need for a blockset. + * + * All information is stored in host byte order. The volume header's magic + * number may be checked to determine the byte order. If you wish to mount + * between machines w/ different endian modes you'll need filesystem code + * which acts on the media data consistently (either all one way or all the + * other). Our code currently does not do that. + * + * A read-write mount may have to recover missing allocations by doing an + * incremental mirror scan looking for modifications made after alloc_tid. + * If alloc_tid == last_tid then no recovery operation is needed. Recovery + * operations are usually very, very fast. + * + * Read-only mounts do not need to do any recovery, access to the filesystem + * topology is always consistent after a crash (is always consistent, period). + * However, there may be shortcutted blockref updates present from deep in + * the tree which are stored in the volumeh eader and must be tracked on + * the fly. + * + * icrc_sect0 only applies to the first 512-4 bytes in the volume header. + * + * COPIES: Multiple copies may be specified on the mount line AND/OR you + * just specify one and the mount code tries to pick up the others + * from copyinfo[]. The copyid field in the volume header along + * with the fsid validates the copies. + * + * NOTE: root_blockref points to the super-root directory, not the root + * directory. The root directory will be a subdirectory under the + * super-root. + * + * The super-root directory contains all root directories and all + * snapshots (readonly or writable). It is possible to do a + * null-mount of the super-root using special path constructions + * relative to your mounted root. + * + * NOTE: HAMMER2 allows any subdirectory tree to be managed as if it were + * a PFS, including mirroring and storage quota operations, and this is + * prefered over creating discrete PFSs in the super-root. Instead + * the super-root is most typically used to create writable snapshots, + * alternative roots, and so forth. The super-root is also used by + * the automatic snapshotting mechanism. + */ +#define HAMMER2_VOLUME_ID_HBO 0x48414d3205172011LLU +#define HAMMER2_VOLUME_ID_ABO 0x11201705324d4148LLU + +struct hammer2_volume_data { + /* + * First 512-byte section + */ + uint64_t magic; /* 0000 Signature */ + hammer2_off_t boot_beg; /* 0008 Boot area (future) */ + hammer2_off_t boot_end; /* 0010 (size = end - beg) */ + hammer2_off_t redo_beg; /* 0018 Redo area (future) */ + hammer2_off_t redo_end; /* 0020 (size = end - beg) */ + hammer2_off_t volu_size; /* 0028 Volume size, bytes */ + + uint32_t version; /* 0030 */ + uint32_t flags; /* 0034 */ + uint8_t copyid; /* 0038 copyid of phys vol */ + uint8_t reserved0039; /* 0039 */ + uint8_t reserved003A; /* 003A */ + uint8_t reserved003B; /* 003B */ + uint32_t reserved003C; /* 003C */ + + uuid_t fsid; /* 0040 */ + uuid_t fstype; /* 0050 */ + + /* + * allocator_size is precalculated at newfs time and does not include + * reserved blocks, boot, or redo areas. 
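+ *
+ * (Sketch only, with rounding details left to newfs_hammer2:
+ *
+ *	allocator_size ~= volu_size
+ *	    - (volu_size / HAMMER2_RESERVE_BYTES64) *
+ *	      HAMMER2_RESERVE_SEG64
+ *	    - (boot_end - boot_beg) - (redo_end - redo_beg);
+ * )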
+ * + * Initial non-reserved-area allocations do not use the allocation + * map but instead adjust alloc_iterator. Dynamic allocations take + * over starting at (allocator_beg). This makes newfs_hammer2's + * job a lot easier and can also serve as a testing jig. + */ + hammer2_off_t allocator_size; /* 0060 Total data space */ + hammer2_off_t allocator_free; /* 0068 Free space */ + hammer2_tid_t allocator_beg; /* 0070 Initial allocations */ + hammer2_tid_t last_tid; /* 0078 Last transaction id */ + hammer2_tid_t alloc_tid; /* 0080 Alloctable modify tid */ + hammer2_blockref_t alloc_blockref; /* 0088-00C7 */ + + /* + * Copyids are allocated dynamically from the copyexists bitmap. + * An id from the active copies set (up to 8, see copyinfo later on) + * may still exist after the copy set has been removed from the + * volume header and its bit will remain active in the bitmap and + * cannot be reused until it is 100% removed from the hierarchy. + */ + uint32_t copyexists[8]; /* 00C8-00E7 copy exists bmap */ + char reserved0140[248]; /* 00E8-01DF */ + + /* + * 32 bit CRC array at the end of the first 512 byte sector. + * + * icrc_sects[7] - First 512-4 bytes of volume header (including all + * the other icrc's except the last one). + * + * icrc_sects[6] - Second 512-4 bytes of volume header, which is + * the blockset for the root. + */ + hammer2_crc32_t icrc_sects[8]; /* 01E0-01FF */ + + /* + * Second 512-byte section. + * + * The entire sector is used by a blockset. + */ + hammer2_blockset_t sroot_blockset; /* 0200 Superroot directory */ + + /* + * Third 512-byte section . + * + * This entire section contains copyinfo specifications, typically + * device serno specifications such as 'serno/.s1d'. Each + * element eats 64 bytes x 8 elements is 512 bytes. + */ + struct hammer2_copy_data copyinfo[8]; + + /* + * Remaining sections are reserved for future use. + */ + char reserved0400[0xFBFC]; /* 0400-FFFC reserved */ + + /* + * icrc on entire volume header + */ + hammer2_crc32_t icrc_volheader; /* FFFC-FFFF full volume icrc*/ +}; + +/* + * Section 0 and section 1 have their own iCRCs. Remaining icrc_sets[] + * entries are reserved for future use. + * + * icrc_volheader iCRCs the whole 64K volume header block and is catch-all + * for anything not individually iCRCd. + */ +#define HAMMER2_VOL_ICRC_SECT0 7 +#define HAMMER2_VOL_ICRC_SECT1 6 + + +#define HAMMER2_VOLUME_BYTES 65536 +#define HAMMER2_VOLUME_ICRCSIZE offsetof(hammer2_volume_data_t, icrc_sect0) + +#define HAMMER2_VOL_VERSION_MIN 1 +#define HAMMER2_VOL_VERSION_DEFAULT 1 +#define HAMMER2_VOL_VERSION_WIP 2 + +#define HAMMER2_NUM_VOLHDRS 4 + +/* + * Prototypes for user & kernel functions. Kernel-only prototypes are + * elsewhere. + */ +uint32_t hammer2_icrc32(const void *buf, size_t size); +uint32_t hammer2_icrc32c(const void *buf, size_t size, uint32_t crc); + +#endif diff --git a/sys/vfs/hammer2/hammer2_icrc.c b/sys/vfs/hammer2/hammer2_icrc.c new file mode 100644 index 0000000000..d18a8665a8 --- /dev/null +++ b/sys/vfs/hammer2/hammer2_icrc.c @@ -0,0 +1,147 @@ +/*- + * Copyright (c) 2005-2010 Daniel Braniss + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + | iSCSI + | $Id: isc_subr.c 560 2009-05-07 07:37:49Z danny $ + */ + +#include +#include + +#include "hammer2_disk.h" + +/*****************************************************************/ +/* */ +/* CRC LOOKUP TABLE */ +/* ================ */ +/* The following CRC lookup table was generated automagically */ +/* by the Rocksoft^tm Model CRC Algorithm Table Generation */ +/* Program V1.0 using the following model parameters: */ +/* */ +/* Width : 4 bytes. */ +/* Poly : 0x1EDC6F41L */ +/* Reverse : TRUE. */ +/* */ +/* For more information on the Rocksoft^tm Model CRC Algorithm, */ +/* see the document titled "A Painless Guide to CRC Error */ +/* Detection Algorithms" by Ross Williams */ +/* (ross@guest.adelaide.edu.au.). This document is likely to be */ +/* in the FTP archive "ftp.adelaide.edu.au/pub/rocksoft". 
*/ +/* */ +/*****************************************************************/ + +static uint32_t crc32Table[256] = { + 0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L, + 0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL, + 0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL, + 0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L, + 0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL, + 0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L, + 0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L, + 0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL, + 0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL, + 0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L, + 0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L, + 0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL, + 0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L, + 0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL, + 0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL, + 0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L, + 0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L, + 0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L, + 0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L, + 0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L, + 0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L, + 0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L, + 0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L, + 0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L, + 0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L, + 0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L, + 0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L, + 0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L, + 0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L, + 0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L, + 0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L, + 0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L, + 0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL, + 0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L, + 0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L, + 0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL, + 0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L, + 0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL, + 0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL, + 0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L, + 0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L, + 0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL, + 0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL, + 0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L, + 0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL, + 0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L, + 0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L, + 0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL, + 0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L, + 0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL, + 0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL, + 0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L, + 0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL, + 0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L, + 0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L, + 0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL, + 0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL, + 0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L, + 0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L, + 0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL, + 0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L, + 0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL, + 0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL, + 0xBE2DA0A5L, 0x4C4623A6L, 
0x5F16D052L, 0xAD7D5351L +}; + +uint32_t +hammer2_icrc32(const void *buf, size_t size) +{ + const uint8_t *p = buf; + uint32_t crc = 0; + + crc = crc ^ 0xffffffff; + while (size--) + crc = crc32Table[(crc ^ *p++) & 0xff] ^ (crc >> 8); + crc = crc ^ 0xffffffff; + return crc; +} + +uint32_t +hammer2_icrc32c(const void *buf, size_t size, uint32_t crc) +{ + const uint8_t *p = buf; + + crc = crc ^ 0xffffffff; + while (size--) + crc = crc32Table[(crc ^ *p++) & 0xff] ^ (crc >> 8); + crc = crc ^ 0xffffffff; + return crc; +} diff --git a/sys/vfs/hammer2/hammer2_mount.h b/sys/vfs/hammer2/hammer2_mount.h new file mode 100644 index 0000000000..be5db5fa10 --- /dev/null +++ b/sys/vfs/hammer2/hammer2_mount.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2011 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _VFS_HAMMER2_MOUNT_H_ +#define _VFS_HAMMER2_MOUNT_H_ + +/* + * This structure is passed from userland to the kernel during the mount + * system call. + * + * The volume name is formatted as '/dev/ad0s1a@LABEL', where the label is + * the mount point under the super-root. + */ +struct hammer2_mount_info { + const char *volume; + int hflags; /* extended hammer mount flags */ + int unused01; + char reserved1[112]; +}; + +#define HMNT2_NOAUTOSNAP 0x00000001 + +#define HMNT2_USERFLAGS (HMNT2_NOAUTOSNAP) + +#endif diff --git a/sys/vfs/hammer2/hammer2_subr.c b/sys/vfs/hammer2/hammer2_subr.c new file mode 100644 index 0000000000..54f9a5ce3d --- /dev/null +++ b/sys/vfs/hammer2/hammer2_subr.c @@ -0,0 +1,1542 @@ +#include +#include +#include +#include +#include +#include +#include + +#include "hammer2.h" + +/* + * HAMMER2 inode locks + * + * HAMMER2 offers shared locks, update locks, and exclusive locks on inodes. + * + * Shared locks allow concurrent access to an inode's fields, but exclude + * access by concurrent exclusive locks. 
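+ *
+ * (Editor's note, illustration only, not part of this patch: a reader
+ * hypothetically wrapping a field access in a shared lock; hi_size is
+ * an assumed field name used for the example:)
+ *
+ *	hammer2_inode_lock_sh(ip);
+ *	size = ip->hi_size;
+ *	hammer2_inode_unlock_sh(ip);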
+ *
+ * Update locks are interesting -- an update lock will be taken after all
+ * shared locks on an inode are released, but once it is in place, shared
+ * locks may proceed. The update state is signalled by a busy flag in the
+ * inode. Only one update lock may be in place at a given time on an inode.
+ *
+ * Exclusive locks prevent concurrent access to the inode.
+ *
+ * XXX: What do we use each for? How is visibility to the inode controlled?
+ */
+
+void
+hammer2_inode_lock_sh(struct hammer2_inode *ip)
+{
+	lockmgr(&ip->hi_lk, LK_SHARED);
+}
+
+void
+hammer2_inode_lock_up(struct hammer2_inode *ip)
+{
+	lockmgr(&ip->hi_lk, LK_EXCLUSIVE);
+	++ip->hi_busy;
+	lockmgr(&ip->hi_lk, LK_DOWNGRADE);
+}
+
+void
+hammer2_inode_lock_ex(struct hammer2_inode *ip)
+{
+	lockmgr(&ip->hi_lk, LK_EXCLUSIVE);
+}
+
+void
+hammer2_inode_unlock_ex(struct hammer2_inode *ip)
+{
+	lockmgr(&ip->hi_lk, LK_RELEASE);
+}
+
+void
+hammer2_inode_unlock_up(struct hammer2_inode *ip)
+{
+	lockmgr(&ip->hi_lk, LK_UPGRADE);
+	--ip->hi_busy;
+	lockmgr(&ip->hi_lk, LK_RELEASE);
+}
+
+void
+hammer2_inode_unlock_sh(struct hammer2_inode *ip)
+{
+	lockmgr(&ip->hi_lk, LK_RELEASE);
+}
+
+/*
+ * Mount-wide locks
+ */
+
+void
+hammer2_mount_exlock(struct hammer2_mount *hmp)
+{
+	lockmgr(&hmp->hm_lk, LK_EXCLUSIVE);
+}
+
+void
+hammer2_mount_shlock(struct hammer2_mount *hmp)
+{
+	lockmgr(&hmp->hm_lk, LK_SHARED);
+}
+
+void
+hammer2_mount_unlock(struct hammer2_mount *hmp)
+{
+	lockmgr(&hmp->hm_lk, LK_RELEASE);
+}
+
+/*
+ * Inode/vnode subroutines
+ */
+
+/*
+ * igetv:
+ *
+ * Get a vnode associated with the given inode. If one exists, return it,
+ * locked and ref-ed. Otherwise, a new vnode is allocated and associated
+ * with the inode.
+ *
+ * The lock prevents the inode from being reclaimed, I believe (XXX)
+ */
+struct vnode *
+igetv(struct hammer2_inode *ip, int *error)
+{
+	struct vnode *vp;
+	struct hammer2_mount *hmp;
+	int rc;
+
+	hmp = ip->hi_mp;
+	rc = 0;
+
+	kprintf("igetv\n");
+	tsleep(&igetv, 0, "", hz * 10);
+
+	hammer2_inode_lock_ex(ip);
+	do {
+		/* Reuse existing vnode */
+		vp = ip->hi_vnode;
+		if (vp) {
+			/* XXX: Is this necessary? */
+			vx_lock(vp);
+			break;
+		}
+
+		/* Allocate and initialize a new vnode */
+		rc = getnewvnode(VT_HAMMER2, H2TOMP(hmp), &vp,
+				 VLKTIMEOUT, LK_CANRECURSE);
+		if (rc) {
+			vp = NULL;
+			break;
+		}
+
+		kprintf("igetv new\n");
+		switch (ip->type & HAMMER2_INODE_TYPE_MASK) {
+		case HAMMER2_INODE_DIR:
+			vp->v_type = VDIR;
+			break;
+		case HAMMER2_INODE_FILE:
+			vp->v_type = VREG;
+			/* XXX: Init w/ true file size; 0 */
+			vinitvmio(vp, 0, PAGE_SIZE, -1);
+			break;
+		default:
+			break;
+		}
+
+		if (ip->type & HAMMER2_INODE_ROOT)
+			vsetflags(vp, VROOT);
+
+		vp->v_data = ip;
+		ip->hi_vnode = vp;
+	} while (0);
+	hammer2_inode_unlock_ex(ip);
+
+	/*
+	 * XXX: Under what conditions can a vnode be reclaimed? How do we
+	 * want to interlock against vreclaim calls into hammer2? When do
+	 * we need to?
+	 */
+
+	kprintf("igetv exit\n");
+
+	/* vp is either NULL or a locked, ref-ed vnode referring to inode ip */
+	*error = rc;
+	return (vp);
+}
+
+/*
+ * alloci:
+ *
+ * Allocate an inode in a HAMMER2 mount. The returned inode is locked
+ * exclusively. The HAMMER2 mountpoint must be locked on entry.
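+ *
+ * (Editor's note, illustration only, not part of this patch: the mount
+ * code later in this patch pairs alloci() and igetv() roughly as below.
+ * Note that igetv() takes the inode lock itself, so the alloci() caller
+ * must drop its exclusive lock first:
+ *
+ *	hammer2_mount_exlock(hmp);
+ *	ip = alloci(hmp);		(returned locked exclusively)
+ *	ip->type = HAMMER2_INODE_DIR;
+ *	hammer2_inode_unlock_ex(ip);
+ *	hammer2_mount_unlock(hmp);
+ *	...
+ *	vp = igetv(ip, &error);		(vnode associated with the inode)
+ * )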
+ */ +struct hammer2_inode *alloci(struct hammer2_mount *hmp) { + struct hammer2_inode *ip; + + kprintf("alloci\n"); + + ip = kmalloc(sizeof(struct hammer2_inode), hmp->hm_inodes, + M_WAITOK | M_ZERO); + if (!ip) { + /* XXX */ + } + + ++hmp->hm_ninodes; + + ip->type = 0; + ip->hi_mp = hmp; + lockinit(&ip->hi_lk, "h2inode", 0, 0); + ip->hi_vnode = NULL; + + hammer2_inode_lock_ex(ip); + + return (ip); +} + +/* $NetBSD: tmpfs_subr.c,v 1.35 2007/07/09 21:10:50 ad Exp $ */ + +/*- + * Copyright (c) 2005 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Julio M. Merino Vidal, developed as part of Google's Summer of Code + * 2005 program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Efficient memory file system supporting functions. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "hammer2.h" + +static ino_t t_ino = 2; +static struct spinlock ino_lock; +static ino_t tmpfs_fetch_ino(void); + +/* --------------------------------------------------------------------- */ + +/* + * Allocates a new node of type 'type' inside the 'tmp' mount point, with + * its owner set to 'uid', its group to 'gid' and its mode set to 'mode', + * using the credentials of the process 'p'. + * + * If the node type is set to 'VDIR', then the parent parameter must point + * to the parent directory of the node being created. It may only be NULL + * while allocating the root node. + * + * If the node type is set to 'VBLK' or 'VCHR', then the rdev parameter + * specifies the device the node represents. + * + * If the node type is set to 'VLNK', then the parameter target specifies + * the file name of the target file for the symbolic link that is being + * created. + * + * Note that new nodes are retrieved from the available list if it has + * items or, if it is empty, from the node pool as long as there is enough + * space to create them. + * + * Returns zero on success or an appropriate error code on failure. 
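+ *
+ * (Editor's note, illustration only, not part of this patch: a
+ * hypothetical caller creating a regular-file node; VNOVAL marks the
+ * unused rmajor/rminor arguments, and target is NULL for non-symlinks:
+ *
+ *	struct tmpfs_node *node;
+ *
+ *	error = tmpfs_alloc_node(tmp, VREG, cred->cr_uid, dnode->tn_gid,
+ *				 vap->va_mode, NULL, NULL, VNOVAL, VNOVAL,
+ *				 &node);
+ * )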
+ */
+int
+tmpfs_alloc_node(struct hammer2_mount *tmp, enum vtype type,
+		 uid_t uid, gid_t gid, mode_t mode, struct tmpfs_node *parent,
+		 char *target, int rmajor, int rminor, struct tmpfs_node **node)
+{
+	struct tmpfs_node *nnode;
+	struct timespec ts;
+	udev_t rdev;
+
+	/*
+	 * If the root directory of the 'tmp' file system is not yet
+	 * allocated, this must be the request to do it.
+	 */
+	KKASSERT(IMPLIES(tmp->tm_root == NULL, parent == NULL && type == VDIR));
+
+	KKASSERT(IFF(type == VLNK, target != NULL));
+	KKASSERT(IFF(type == VBLK || type == VCHR, rmajor != VNOVAL));
+
+	if (tmp->tm_nodes_inuse >= tmp->tm_nodes_max)
+		return (ENOSPC);
+
+	nnode = objcache_get(tmp->tm_node_pool, M_WAITOK | M_NULLOK);
+	if (nnode == NULL)
+		return (ENOSPC);
+
+	/* Generic initialization. */
+	nnode->tn_type = type;
+	vfs_timestamp(&ts);
+	nnode->tn_ctime = nnode->tn_mtime = nnode->tn_atime
+	    = ts.tv_sec;
+	nnode->tn_ctimensec = nnode->tn_mtimensec = nnode->tn_atimensec
+	    = ts.tv_nsec;
+	nnode->tn_uid = uid;
+	nnode->tn_gid = gid;
+	nnode->tn_mode = mode;
+	nnode->tn_id = tmpfs_fetch_ino();
+	nnode->tn_advlock.init_done = 0;
+
+	/* Type-specific initialization. */
+	switch (nnode->tn_type) {
+	case VBLK:
+	case VCHR:
+		rdev = makeudev(rmajor, rminor);
+		if (rdev == NOUDEV) {
+			objcache_put(tmp->tm_node_pool, nnode);
+			return (EINVAL);
+		}
+		nnode->tn_rdev = rdev;
+		break;
+
+	case VDIR:
+		TAILQ_INIT(&nnode->tn_dir.tn_dirhead);
+		KKASSERT(parent != nnode);
+		KKASSERT(IMPLIES(parent == NULL, tmp->tm_root == NULL));
+		nnode->tn_dir.tn_parent = parent;
+		nnode->tn_dir.tn_readdir_lastn = 0;
+		nnode->tn_dir.tn_readdir_lastp = NULL;
+		nnode->tn_links++;
+		nnode->tn_size = 0;
+		if (parent) {
+			TMPFS_NODE_LOCK(parent);
+			parent->tn_links++;
+			TMPFS_NODE_UNLOCK(parent);
+		}
+		break;
+
+	case VFIFO:
+		/* FALLTHROUGH */
+	case VSOCK:
+		break;
+
+	case VLNK:
+		nnode->tn_size = strlen(target);
+		nnode->tn_link = kmalloc(nnode->tn_size + 1, tmp->tm_name_zone,
+					 M_WAITOK | M_NULLOK);
+		if (nnode->tn_link == NULL) {
+			objcache_put(tmp->tm_node_pool, nnode);
+			return (ENOSPC);
+		}
+		bcopy(target, nnode->tn_link, nnode->tn_size);
+		nnode->tn_link[nnode->tn_size] = '\0';
+		break;
+
+	case VREG:
+		nnode->tn_reg.tn_aobj =
+		    swap_pager_alloc(NULL, 0, VM_PROT_DEFAULT, 0);
+		nnode->tn_reg.tn_aobj_pages = 0;
+		nnode->tn_size = 0;
+		break;
+
+	default:
+		panic("tmpfs_alloc_node: type %p %d", nnode, (int)nnode->tn_type);
+	}
+
+	TMPFS_NODE_LOCK(nnode);
+	TMPFS_LOCK(tmp);
+	LIST_INSERT_HEAD(&tmp->tm_nodes_used, nnode, tn_entries);
+	tmp->tm_nodes_inuse++;
+	TMPFS_UNLOCK(tmp);
+	TMPFS_NODE_UNLOCK(nnode);
+
+	*node = nnode;
+	return 0;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Destroys the node pointed to by node from the file system 'tmp'.
+ * If the node does not belong to the given mount point, the results are
+ * unpredictable.
+ *
+ * If the node references a directory, it must have no entries, because
+ * their removal could need a recursive algorithm, something forbidden in
+ * kernel space. Furthermore, there is no need to provide such
+ * functionality (recursive removal) because the only primitives offered
+ * to the user are the removal of empty directories and the deletion of
+ * individual files.
+ *
+ * Note that nodes are not really deleted; in fact, when a node has been
+ * allocated, it cannot be deleted during the whole life of the file
+ * system. Instead, they are moved to the available list and remain there
+ * until reused.
+ */ +void +tmpfs_free_node(struct hammer2_mount *tmp, struct tmpfs_node *node) +{ + vm_pindex_t pages = 0; + +#ifdef INVARIANTS + TMPFS_ASSERT_ELOCKED(node); + KKASSERT(node->tn_vnode == NULL); + KKASSERT((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0); +#endif + + TMPFS_LOCK(tmp); + LIST_REMOVE(node, tn_entries); + tmp->tm_nodes_inuse--; + TMPFS_UNLOCK(tmp); + TMPFS_NODE_UNLOCK(node); + + switch (node->tn_type) { + case VNON: + /* Do not do anything. VNON is provided to let the + * allocation routine clean itself easily by avoiding + * duplicating code in it. */ + /* FALLTHROUGH */ + case VBLK: + /* FALLTHROUGH */ + case VCHR: + /* FALLTHROUGH */ + break; + case VDIR: + /* + * The parent link can be NULL if this is the root + * node. + */ + node->tn_links--; + node->tn_size = 0; + KKASSERT(node->tn_dir.tn_parent || node == tmp->tm_root); + if (node->tn_dir.tn_parent) { + TMPFS_NODE_LOCK(node->tn_dir.tn_parent); + node->tn_dir.tn_parent->tn_links--; + + /* + * If the parent directory has no more links and + * no vnode ref nothing is going to come along + * and clean it up unless we do it here. + */ + if (node->tn_dir.tn_parent->tn_links == 0 && + node->tn_dir.tn_parent->tn_vnode == NULL) { + tmpfs_free_node(tmp, node->tn_dir.tn_parent); + /* eats parent lock */ + } else { + TMPFS_NODE_UNLOCK(node->tn_dir.tn_parent); + } + node->tn_dir.tn_parent = NULL; + } + + /* + * If the root node is being destroyed don't leave a + * dangling pointer in hammer2_mount. + */ + if (node == tmp->tm_root) + tmp->tm_root = NULL; + break; + case VFIFO: + /* FALLTHROUGH */ + case VSOCK: + break; + + case VLNK: + kfree(node->tn_link, tmp->tm_name_zone); + node->tn_link = NULL; + node->tn_size = 0; + break; + + case VREG: + if (node->tn_reg.tn_aobj != NULL) + vm_object_deallocate(node->tn_reg.tn_aobj); + node->tn_reg.tn_aobj = NULL; + pages = node->tn_reg.tn_aobj_pages; + break; + + default: + panic("tmpfs_free_node: type %p %d", node, (int)node->tn_type); + } + + /* + * Clean up fields for the next allocation. The objcache only ctors + * new allocations. + */ + tmpfs_node_ctor(node, NULL, 0); + objcache_put(tmp->tm_node_pool, node); + /* node is now invalid */ + + TMPFS_LOCK(tmp); + tmp->tm_pages_used -= pages; + TMPFS_UNLOCK(tmp); +} + +/* --------------------------------------------------------------------- */ + +/* + * Allocates a new directory entry for the node node with a name of name. + * The new directory entry is returned in *de. + * + * The link count of node is increased by one to reflect the new object + * referencing it. + * + * Returns zero on success or an appropriate error code on failure. + */ +int +tmpfs_alloc_dirent(struct hammer2_mount *tmp, struct tmpfs_node *node, + const char *name, uint16_t len, struct tmpfs_dirent **de) +{ + struct tmpfs_dirent *nde; + + nde = objcache_get(tmp->tm_dirent_pool, M_WAITOK); + nde->td_name = kmalloc(len + 1, tmp->tm_name_zone, M_WAITOK | M_NULLOK); + if (nde->td_name == NULL) { + objcache_put(tmp->tm_dirent_pool, nde); + *de = NULL; + return (ENOSPC); + } + nde->td_namelen = len; + bcopy(name, nde->td_name, len); + nde->td_name[len] = '\0'; + + nde->td_node = node; + + TMPFS_NODE_LOCK(node); + node->tn_links++; + TMPFS_NODE_UNLOCK(node); + + *de = nde; + + return 0; +} + +/* --------------------------------------------------------------------- */ + +/* + * Frees a directory entry. It is the caller's responsibility to destroy + * the node referenced by it if needed. 
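+ *
+ * (Editor's note, illustration only, not part of this patch: a removal
+ * path hypothetically drives detach/free in this order:
+ *
+ *	tmpfs_dir_detach(dnode, de);	(unhook from the directory)
+ *	tmpfs_free_dirent(tmp, de);	(drops the node's link count)
+ *
+ * followed by tmpfs_free_node(tmp, node) once links and vnode are gone.)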
+ *
+ * The link count of the node is decreased by one to reflect the removal
+ * of an object that referenced it; the node itself is not freed here.
+ */
+void
+tmpfs_free_dirent(struct hammer2_mount *tmp, struct tmpfs_dirent *de)
+{
+	struct tmpfs_node *node;
+
+	node = de->td_node;
+
+	TMPFS_NODE_LOCK(node);
+	TMPFS_ASSERT_ELOCKED(node);
+	KKASSERT(node->tn_links > 0);
+	node->tn_links--;
+	TMPFS_NODE_UNLOCK(node);
+
+	kfree(de->td_name, tmp->tm_name_zone);
+	de->td_namelen = 0;
+	de->td_name = NULL;
+	de->td_node = NULL;
+	objcache_put(tmp->tm_dirent_pool, de);
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Allocates a new vnode for the node node or returns a new reference to
+ * an existing one if the node already had a vnode referencing it. The
+ * resulting locked vnode is returned in *vpp.
+ *
+ * Returns zero on success or an appropriate error code on failure.
+ */
+int
+tmpfs_alloc_vp(struct mount *mp, struct tmpfs_node *node, int lkflag,
+	       struct vnode **vpp)
+{
+	int error = 0;
+	struct vnode *vp;
+
+loop:
+	/*
+	 * Interlocked extraction from node. This can race many things.
+	 * We have to get a soft reference on the vnode while we hold
+	 * the node locked, then acquire it properly and check for races.
+	 */
+	TMPFS_NODE_LOCK(node);
+	if ((vp = node->tn_vnode) != NULL) {
+		KKASSERT((node->tn_vpstate & TMPFS_VNODE_DOOMED) == 0);
+		vhold_interlocked(vp);
+		TMPFS_NODE_UNLOCK(node);
+
+		if (vget(vp, lkflag | LK_EXCLUSIVE) != 0) {
+			vdrop(vp);
+			goto loop;
+		}
+		if (node->tn_vnode != vp) {
+			vput(vp);
+			vdrop(vp);
+			goto loop;
+		}
+		vdrop(vp);
+		goto out;
+	}
+	/* vp is NULL */
+
+	/*
+	 * This should never happen.
+	 */
+	if (node->tn_vpstate & TMPFS_VNODE_DOOMED) {
+		TMPFS_NODE_UNLOCK(node);
+		error = ENOENT;
+		goto out;
+	}
+
+	/*
+	 * Interlock against other calls to tmpfs_alloc_vp() trying to
+	 * allocate and assign a vp to node.
+	 */
+	if (node->tn_vpstate & TMPFS_VNODE_ALLOCATING) {
+		node->tn_vpstate |= TMPFS_VNODE_WANT;
+		error = tsleep(&node->tn_vpstate, PINTERLOCKED | PCATCH,
+			       "tmpfs_alloc_vp", 0);
+		TMPFS_NODE_UNLOCK(node);
+		if (error)
+			return error;
+		goto loop;
+	}
+	node->tn_vpstate |= TMPFS_VNODE_ALLOCATING;
+	TMPFS_NODE_UNLOCK(node);
+
+	/*
+	 * Allocate a new vnode (may block). The ALLOCATING flag should
+	 * prevent a race against someone else assigning node->tn_vnode.
+	 */
+	error = getnewvnode(VT_TMPFS, mp, &vp, VLKTIMEOUT, LK_CANRECURSE);
+	if (error != 0)
+		goto unlock;
+
+	KKASSERT(node->tn_vnode == NULL);
+	KKASSERT(vp != NULL);
+	vp->v_data = node;
+	vp->v_type = node->tn_type;
+
+	/* Type-specific initialization.
*/ + switch (node->tn_type) { + case VBLK: + /* FALLTHROUGH */ + case VCHR: + /* FALLTHROUGH */ + case VSOCK: + break; + case VREG: + vinitvmio(vp, node->tn_size, BMASK, -1); + break; + case VLNK: + break; + case VFIFO: + vp->v_ops = &mp->mnt_vn_fifo_ops; + break; + case VDIR: + break; + + default: + panic("tmpfs_alloc_vp: type %p %d", node, (int)node->tn_type); + } + + insmntque(vp, mp); + +unlock: + TMPFS_NODE_LOCK(node); + + KKASSERT(node->tn_vpstate & TMPFS_VNODE_ALLOCATING); + node->tn_vpstate &= ~TMPFS_VNODE_ALLOCATING; + node->tn_vnode = vp; + + if (node->tn_vpstate & TMPFS_VNODE_WANT) { + node->tn_vpstate &= ~TMPFS_VNODE_WANT; + TMPFS_NODE_UNLOCK(node); + wakeup(&node->tn_vpstate); + } else { + TMPFS_NODE_UNLOCK(node); + } + +out: + *vpp = vp; + + KKASSERT(IFF(error == 0, *vpp != NULL && vn_islocked(*vpp))); +#ifdef INVARIANTS + TMPFS_NODE_LOCK(node); + KKASSERT(*vpp == node->tn_vnode); + TMPFS_NODE_UNLOCK(node); +#endif + + return error; +} + +/* --------------------------------------------------------------------- */ + +/* + * Destroys the association between the vnode vp and the node it + * references. + */ +void +tmpfs_free_vp(struct vnode *vp) +{ + struct tmpfs_node *node; + + node = VP_TO_TMPFS_NODE(vp); + + TMPFS_NODE_LOCK(node); + KKASSERT(lockcount(TMPFS_NODE_MTX(node)) > 0); + node->tn_vnode = NULL; + TMPFS_NODE_UNLOCK(node); + vp->v_data = NULL; +} + +/* --------------------------------------------------------------------- */ + +/* + * Allocates a new file of type 'type' and adds it to the parent directory + * 'dvp'; this addition is done using the component name given in 'cnp'. + * The ownership of the new file is automatically assigned based on the + * credentials of the caller (through 'cnp'), the group is set based on + * the parent directory and the mode is determined from the 'vap' argument. + * If successful, *vpp holds a vnode to the newly created file and zero + * is returned. Otherwise *vpp is NULL and the function returns an + * appropriate error code. + */ +int +tmpfs_alloc_file(struct vnode *dvp, struct vnode **vpp, struct vattr *vap, + struct namecache *ncp, struct ucred *cred, char *target) +{ + int error; + struct tmpfs_dirent *de; + struct hammer2_mount *tmp; + struct tmpfs_node *dnode; + struct tmpfs_node *node; + struct tmpfs_node *parent; + + tmp = VFS_TO_TMPFS(dvp->v_mount); + dnode = VP_TO_TMPFS_DIR(dvp); + *vpp = NULL; + + /* If the entry we are creating is a directory, we cannot overflow + * the number of links of its parent, because it will get a new + * link. */ + if (vap->va_type == VDIR) { + /* Ensure that we do not overflow the maximum number of links + * imposed by the system. */ + KKASSERT(dnode->tn_links <= LINK_MAX); + if (dnode->tn_links == LINK_MAX) { + return EMLINK; + } + + parent = dnode; + KKASSERT(parent != NULL); + } else + parent = NULL; + + /* Allocate a node that represents the new file. */ + error = tmpfs_alloc_node(tmp, vap->va_type, cred->cr_uid, + dnode->tn_gid, vap->va_mode, parent, target, vap->va_rmajor, vap->va_rminor, &node); + if (error != 0) + return error; + TMPFS_NODE_LOCK(node); + + /* Allocate a directory entry that points to the new file. */ + error = tmpfs_alloc_dirent(tmp, node, ncp->nc_name, ncp->nc_nlen, &de); + if (error != 0) { + tmpfs_free_node(tmp, node); + /* eats node lock */ + return error; + } + + /* Allocate a vnode for the new file. 
*/ + error = tmpfs_alloc_vp(dvp->v_mount, node, LK_EXCLUSIVE, vpp); + if (error != 0) { + tmpfs_free_dirent(tmp, de); + tmpfs_free_node(tmp, node); + /* eats node lock */ + return error; + } + + /* Now that all required items are allocated, we can proceed to + * insert the new node into the directory, an operation that + * cannot fail. */ + tmpfs_dir_attach(dnode, de); + TMPFS_NODE_UNLOCK(node); + + return error; +} + +/* --------------------------------------------------------------------- */ + +/* + * Attaches the directory entry de to the directory represented by vp. + * Note that this does not change the link count of the node pointed by + * the directory entry, as this is done by tmpfs_alloc_dirent. + */ +void +tmpfs_dir_attach(struct tmpfs_node *dnode, struct tmpfs_dirent *de) +{ + TMPFS_NODE_LOCK(dnode); + TAILQ_INSERT_TAIL(&dnode->tn_dir.tn_dirhead, de, td_entries); + + TMPFS_ASSERT_ELOCKED(dnode); + dnode->tn_size += sizeof(struct tmpfs_dirent); + dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | + TMPFS_NODE_MODIFIED; + TMPFS_NODE_UNLOCK(dnode); +} + +/* --------------------------------------------------------------------- */ + +/* + * Detaches the directory entry de from the directory represented by vp. + * Note that this does not change the link count of the node pointed by + * the directory entry, as this is done by tmpfs_free_dirent. + */ +void +tmpfs_dir_detach(struct tmpfs_node *dnode, struct tmpfs_dirent *de) +{ + TMPFS_NODE_LOCK(dnode); + if (dnode->tn_dir.tn_readdir_lastp == de) { + dnode->tn_dir.tn_readdir_lastn = 0; + dnode->tn_dir.tn_readdir_lastp = NULL; + } + TAILQ_REMOVE(&dnode->tn_dir.tn_dirhead, de, td_entries); + + TMPFS_ASSERT_ELOCKED(dnode); + dnode->tn_size -= sizeof(struct tmpfs_dirent); + dnode->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | + TMPFS_NODE_MODIFIED; + TMPFS_NODE_UNLOCK(dnode); +} + +/* --------------------------------------------------------------------- */ + +/* + * Looks for a directory entry in the directory represented by node. + * 'ncp' describes the name of the entry to look for. Note that the . + * and .. components are not allowed as they do not physically exist + * within directories. + * + * Returns a pointer to the entry when found, otherwise NULL. + */ +struct tmpfs_dirent * +tmpfs_dir_lookup(struct tmpfs_node *node, struct tmpfs_node *f, + struct namecache *ncp) +{ + struct tmpfs_dirent *de; + int len = ncp->nc_nlen; + + TMPFS_VALIDATE_DIR(node); + + TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries) { + if (f != NULL && de->td_node != f) + continue; + if (len == de->td_namelen) { + if (!memcmp(ncp->nc_name, de->td_name, len)) + break; + } + } + + TMPFS_NODE_LOCK(node); + node->tn_status |= TMPFS_NODE_ACCESSED; + TMPFS_NODE_UNLOCK(node); + + return de; +} + +/* --------------------------------------------------------------------- */ + +/* + * Helper function for tmpfs_readdir. Creates a '.' entry for the given + * directory and returns it in the uio space. The function returns 0 + * on success, -1 if there was not enough space in the uio structure to + * hold the directory entry or an appropriate error code if another + * error happens. 
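+ *
+ * (Editor's note, illustration only, not part of this patch: a readdir
+ * implementation hypothetically dispatches on the offset cookie, e.g.:
+ *
+ *	error = 0;
+ *	if (uio->uio_offset == TMPFS_DIRCOOKIE_DOT)
+ *		error = tmpfs_dir_getdotdent(node, uio);
+ *	if (error == 0 && uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT)
+ *		error = tmpfs_dir_getdotdotdent(tmp, node, uio);
+ *	if (error == 0 && uio->uio_offset != TMPFS_DIRCOOKIE_EOF)
+ *		error = tmpfs_dir_getdents(node, uio, &cnt);
+ * )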
+ */ +int +tmpfs_dir_getdotdent(struct tmpfs_node *node, struct uio *uio) +{ + int error; + struct dirent dent; + int dirsize; + + TMPFS_VALIDATE_DIR(node); + KKASSERT(uio->uio_offset == TMPFS_DIRCOOKIE_DOT); + + dent.d_ino = node->tn_id; + dent.d_type = DT_DIR; + dent.d_namlen = 1; + dent.d_name[0] = '.'; + dent.d_name[1] = '\0'; + dirsize = _DIRENT_DIRSIZ(&dent); + + if (dirsize > uio->uio_resid) + error = -1; + else { + error = uiomove((caddr_t)&dent, dirsize, uio); + if (error == 0) + uio->uio_offset = TMPFS_DIRCOOKIE_DOTDOT; + } + + TMPFS_NODE_LOCK(node); + node->tn_status |= TMPFS_NODE_ACCESSED; + TMPFS_NODE_UNLOCK(node); + + return error; +} + +/* --------------------------------------------------------------------- */ + +/* + * Helper function for tmpfs_readdir. Creates a '..' entry for the given + * directory and returns it in the uio space. The function returns 0 + * on success, -1 if there was not enough space in the uio structure to + * hold the directory entry or an appropriate error code if another + * error happens. + */ +int +tmpfs_dir_getdotdotdent(struct hammer2_mount *tmp, struct tmpfs_node *node, + struct uio *uio) +{ + int error; + struct dirent dent; + int dirsize; + + TMPFS_VALIDATE_DIR(node); + KKASSERT(uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT); + + if (node->tn_dir.tn_parent) { + TMPFS_NODE_LOCK(node->tn_dir.tn_parent); + dent.d_ino = node->tn_dir.tn_parent->tn_id; + TMPFS_NODE_UNLOCK(node->tn_dir.tn_parent); + } else { + dent.d_ino = tmp->tm_root->tn_id; + } + + dent.d_type = DT_DIR; + dent.d_namlen = 2; + dent.d_name[0] = '.'; + dent.d_name[1] = '.'; + dent.d_name[2] = '\0'; + dirsize = _DIRENT_DIRSIZ(&dent); + + if (dirsize > uio->uio_resid) + error = -1; + else { + error = uiomove((caddr_t)&dent, dirsize, uio); + if (error == 0) { + struct tmpfs_dirent *de; + + de = TAILQ_FIRST(&node->tn_dir.tn_dirhead); + if (de == NULL) + uio->uio_offset = TMPFS_DIRCOOKIE_EOF; + else + uio->uio_offset = tmpfs_dircookie(de); + } + } + + TMPFS_NODE_LOCK(node); + node->tn_status |= TMPFS_NODE_ACCESSED; + TMPFS_NODE_UNLOCK(node); + + return error; +} + +/* --------------------------------------------------------------------- */ + +/* + * Lookup a directory entry by its associated cookie. + */ +struct tmpfs_dirent * +tmpfs_dir_lookupbycookie(struct tmpfs_node *node, off_t cookie) +{ + struct tmpfs_dirent *de; + + if (cookie == node->tn_dir.tn_readdir_lastn && + node->tn_dir.tn_readdir_lastp != NULL) { + return node->tn_dir.tn_readdir_lastp; + } + + TAILQ_FOREACH(de, &node->tn_dir.tn_dirhead, td_entries) { + if (tmpfs_dircookie(de) == cookie) { + break; + } + } + + return de; +} + +/* --------------------------------------------------------------------- */ + +/* + * Helper function for tmpfs_readdir. Returns as much directory entries + * as can fit in the uio space. The read starts at uio->uio_offset. + * The function returns 0 on success, -1 if there was not enough space + * in the uio structure to hold the directory entry or an appropriate + * error code if another error happens. + */ +int +tmpfs_dir_getdents(struct tmpfs_node *node, struct uio *uio, off_t *cntp) +{ + int error; + off_t startcookie; + struct tmpfs_dirent *de; + + TMPFS_VALIDATE_DIR(node); + + /* Locate the first directory entry we have to return. We have cached + * the last readdir in the node, so use those values if appropriate. + * Otherwise do a linear scan to find the requested entry. 
*/ + startcookie = uio->uio_offset; + KKASSERT(startcookie != TMPFS_DIRCOOKIE_DOT); + KKASSERT(startcookie != TMPFS_DIRCOOKIE_DOTDOT); + if (startcookie == TMPFS_DIRCOOKIE_EOF) { + return 0; + } else { + de = tmpfs_dir_lookupbycookie(node, startcookie); + } + if (de == NULL) { + return EINVAL; + } + + /* Read as much entries as possible; i.e., until we reach the end of + * the directory or we exhaust uio space. */ + do { + struct dirent d; + int reclen; + + /* Create a dirent structure representing the current + * tmpfs_node and fill it. */ + d.d_ino = de->td_node->tn_id; + switch (de->td_node->tn_type) { + case VBLK: + d.d_type = DT_BLK; + break; + + case VCHR: + d.d_type = DT_CHR; + break; + + case VDIR: + d.d_type = DT_DIR; + break; + + case VFIFO: + d.d_type = DT_FIFO; + break; + + case VLNK: + d.d_type = DT_LNK; + break; + + case VREG: + d.d_type = DT_REG; + break; + + case VSOCK: + d.d_type = DT_SOCK; + break; + + default: + panic("tmpfs_dir_getdents: type %p %d", + de->td_node, (int)de->td_node->tn_type); + } + d.d_namlen = de->td_namelen; + KKASSERT(de->td_namelen < sizeof(d.d_name)); + bcopy(de->td_name, d.d_name, d.d_namlen); + d.d_name[d.d_namlen] = '\0'; + reclen = _DIRENT_RECLEN(d.d_namlen); + + /* Stop reading if the directory entry we are treating is + * bigger than the amount of data that can be returned. */ + if (reclen > uio->uio_resid) { + error = -1; + break; + } + + /* Copy the new dirent structure into the output buffer and + * advance pointers. */ + error = uiomove((caddr_t)&d, reclen, uio); + + (*cntp)++; + de = TAILQ_NEXT(de, td_entries); + } while (error == 0 && uio->uio_resid > 0 && de != NULL); + + /* Update the offset and cache. */ + if (de == NULL) { + uio->uio_offset = TMPFS_DIRCOOKIE_EOF; + node->tn_dir.tn_readdir_lastn = 0; + node->tn_dir.tn_readdir_lastp = NULL; + } else { + node->tn_dir.tn_readdir_lastn = uio->uio_offset = tmpfs_dircookie(de); + node->tn_dir.tn_readdir_lastp = de; + } + node->tn_status |= TMPFS_NODE_ACCESSED; + + return error; +} + +/* --------------------------------------------------------------------- */ + +/* + * Resizes the aobj associated to the regular file pointed to by vp to + * the size newsize. 'vp' must point to a vnode that represents a regular + * file. 'newsize' must be positive. + * + * pass trivial as 1 when buf content will be overwritten, otherwise set 0 + * to be zero filled. + * + * Returns zero on success or an appropriate error code on failure. + */ +int +tmpfs_reg_resize(struct vnode *vp, off_t newsize, int trivial) +{ + int error; + vm_pindex_t newpages, oldpages; + struct hammer2_mount *tmp; + struct tmpfs_node *node; + off_t oldsize; + +#ifdef INVARIANTS + KKASSERT(vp->v_type == VREG); + KKASSERT(newsize >= 0); +#endif + + node = VP_TO_TMPFS_NODE(vp); + tmp = VFS_TO_TMPFS(vp->v_mount); + + /* Convert the old and new sizes to the number of pages needed to + * store them. It may happen that we do not need to do anything + * because the last allocated page can accommodate the change on + * its own. 
*/ + oldsize = node->tn_size; + oldpages = round_page64(oldsize) / PAGE_SIZE; + KKASSERT(oldpages == node->tn_reg.tn_aobj_pages); + newpages = round_page64(newsize) / PAGE_SIZE; + + if (newpages > oldpages && + tmp->tm_pages_used + newpages - oldpages > tmp->tm_pages_max) { + error = ENOSPC; + goto out; + } + + TMPFS_LOCK(tmp); + tmp->tm_pages_used += (newpages - oldpages); + TMPFS_UNLOCK(tmp); + + TMPFS_NODE_LOCK(node); + node->tn_reg.tn_aobj_pages = newpages; + node->tn_size = newsize; + TMPFS_NODE_UNLOCK(node); + + /* + * When adjusting the vnode filesize and its VM object we must + * also adjust our backing VM object (aobj). The blocksize + * used must match the block sized we use for the buffer cache. + * + * The backing VM object contains no VM pages, only swap + * assignments. + */ + if (newsize < oldsize) { + vm_pindex_t osize; + vm_pindex_t nsize; + vm_object_t aobj; + + error = nvtruncbuf(vp, newsize, BSIZE, -1); + aobj = node->tn_reg.tn_aobj; + if (aobj) { + osize = aobj->size; + nsize = vp->v_object->size; + if (nsize < osize) { + aobj->size = osize; + swap_pager_freespace(aobj, nsize, + osize - nsize); + } + } + } else { + vm_object_t aobj; + + error = nvextendbuf(vp, oldsize, newsize, BSIZE, BSIZE, + -1, -1, trivial); + aobj = node->tn_reg.tn_aobj; + if (aobj) + aobj->size = vp->v_object->size; + } + +out: + return error; +} + +/* --------------------------------------------------------------------- */ + +/* + * Change flags of the given vnode. + * Caller should execute tmpfs_update on vp after a successful execution. + * The vnode must be locked on entry and remain locked on exit. + */ +int +tmpfs_chflags(struct vnode *vp, int vaflags, struct ucred *cred) +{ + int error; + struct tmpfs_node *node; + int flags; + + KKASSERT(vn_islocked(vp)); + + node = VP_TO_TMPFS_NODE(vp); + flags = node->tn_flags; + + /* Disallow this operation if the file system is mounted read-only. */ + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return EROFS; + error = vop_helper_setattr_flags(&flags, vaflags, node->tn_uid, cred); + + /* + * Unprivileged processes are not permitted to unset system + * flags, or modify flags if any system flags are set. + * + * Silently enforce SF_NOCACHE on the root tmpfs vnode so + * tmpfs data is not double-cached by swapcache. + */ + if (error == 0) { + TMPFS_NODE_LOCK(node); + if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0)) { + if (vp->v_flag & VROOT) + flags |= SF_NOCACHE; + node->tn_flags = flags; + } else { + if (node->tn_flags & (SF_NOUNLINK | SF_IMMUTABLE | + SF_APPEND) || + (flags & UF_SETTABLE) != flags) { + error = EPERM; + } else { + node->tn_flags &= SF_SETTABLE; + node->tn_flags |= (flags & UF_SETTABLE); + } + } + node->tn_status |= TMPFS_NODE_CHANGED; + TMPFS_NODE_UNLOCK(node); + } + + KKASSERT(vn_islocked(vp)); + + return error; +} + +/* --------------------------------------------------------------------- */ + +/* + * Change access mode on the given vnode. + * Caller should execute tmpfs_update on vp after a successful execution. + * The vnode must be locked on entry and remain locked on exit. + */ +int +tmpfs_chmod(struct vnode *vp, mode_t vamode, struct ucred *cred) +{ + struct tmpfs_node *node; + mode_t cur_mode; + int error; + + KKASSERT(vn_islocked(vp)); + + node = VP_TO_TMPFS_NODE(vp); + + /* Disallow this operation if the file system is mounted read-only. */ + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return EROFS; + + /* Immutable or append-only files cannot be modified, either. 
 */
+	if (node->tn_flags & (IMMUTABLE | APPEND))
+		return EPERM;
+
+	cur_mode = node->tn_mode;
+	error = vop_helper_chmod(vp, vamode, cred, node->tn_uid, node->tn_gid,
+				 &cur_mode);
+
+	if (error == 0 &&
+	    (node->tn_mode & ALLPERMS) != (cur_mode & ALLPERMS)) {
+		TMPFS_NODE_LOCK(node);
+		node->tn_mode &= ~ALLPERMS;
+		node->tn_mode |= cur_mode & ALLPERMS;
+
+		node->tn_status |= TMPFS_NODE_CHANGED;
+		TMPFS_NODE_UNLOCK(node);
+	}
+
+	KKASSERT(vn_islocked(vp));
+
+	return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Change ownership of the given vnode. At least one of uid or gid must
+ * be different than VNOVAL. If one is set to that value, the attribute
+ * is unchanged.
+ * Caller should execute tmpfs_update on vp after a successful execution.
+ * The vnode must be locked on entry and remain locked on exit.
+ */
+int
+tmpfs_chown(struct vnode *vp, uid_t uid, gid_t gid, struct ucred *cred)
+{
+	mode_t cur_mode;
+	uid_t cur_uid;
+	gid_t cur_gid;
+	struct tmpfs_node *node;
+	int error;
+
+	KKASSERT(vn_islocked(vp));
+	node = VP_TO_TMPFS_NODE(vp);
+
+	/* Disallow this operation if the file system is mounted read-only. */
+	if (vp->v_mount->mnt_flag & MNT_RDONLY)
+		return EROFS;
+
+	/* Immutable or append-only files cannot be modified, either. */
+	if (node->tn_flags & (IMMUTABLE | APPEND))
+		return EPERM;
+
+	cur_uid = node->tn_uid;
+	cur_gid = node->tn_gid;
+	cur_mode = node->tn_mode;
+	error = vop_helper_chown(vp, uid, gid, cred,
+				 &cur_uid, &cur_gid, &cur_mode);
+
+	if (error == 0) {
+		TMPFS_NODE_LOCK(node);
+		if (cur_uid != node->tn_uid ||
+		    cur_gid != node->tn_gid ||
+		    cur_mode != node->tn_mode) {
+			node->tn_uid = cur_uid;
+			node->tn_gid = cur_gid;
+			node->tn_mode = cur_mode;
+			node->tn_status |= TMPFS_NODE_CHANGED;
+		}
+		TMPFS_NODE_UNLOCK(node);
+	}
+
+	return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Change size of the given vnode.
+ * Caller should execute tmpfs_update on vp after a successful execution.
+ * The vnode must be locked on entry and remain locked on exit.
+ */
+int
+tmpfs_chsize(struct vnode *vp, u_quad_t size, struct ucred *cred)
+{
+	int error;
+	struct tmpfs_node *node;
+
+	KKASSERT(vn_islocked(vp));
+
+	node = VP_TO_TMPFS_NODE(vp);
+
+	/* Decide whether this is a valid operation based on the file type. */
+	error = 0;
+	switch (vp->v_type) {
+	case VDIR:
+		return EISDIR;
+
+	case VREG:
+		if (vp->v_mount->mnt_flag & MNT_RDONLY)
+			return EROFS;
+		break;
+
+	case VBLK:
+		/* FALLTHROUGH */
+	case VCHR:
+		/* FALLTHROUGH */
+	case VFIFO:
+		/*
+		 * Allow modifications of special files even if the file
+		 * system is mounted read-only (we are not modifying the
+		 * files themselves, but the objects they represent).
+		 */
+		return 0;
+
+	default:
+		/* Anything else is unsupported. */
+		return EOPNOTSUPP;
+	}
+
+	/* Immutable or append-only files cannot be modified, either. */
+	if (node->tn_flags & (IMMUTABLE | APPEND))
+		return EPERM;
+
+	error = tmpfs_truncate(vp, size);
+	/*
+	 * tmpfs_truncate will raise the NOTE_EXTEND and NOTE_ATTRIB kevents
+	 * for us, as well as update tn_status; no need to do that here.
+	 */
+
+	KKASSERT(vn_islocked(vp));
+
+	return error;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Change access and modification times of the given vnode.
+ * Caller should execute tmpfs_update on vp after a successful execution.
+ * The vnode must be locked on entry and remain locked on exit.
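+ *
+ * (Editor's note, illustration only, not part of this patch: a setattr
+ * path would hypothetically call this and then sync the timestamps:
+ *
+ *	error = tmpfs_chtimes(vp, &vap->va_atime, &vap->va_mtime,
+ *			      vap->va_vaflags, cred);
+ *	if (error == 0)
+ *		tmpfs_update(vp);
+ * )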
+ */ +int +tmpfs_chtimes(struct vnode *vp, struct timespec *atime, struct timespec *mtime, + int vaflags, struct ucred *cred) +{ + struct tmpfs_node *node; + + KKASSERT(vn_islocked(vp)); + + node = VP_TO_TMPFS_NODE(vp); + + /* Disallow this operation if the file system is mounted read-only. */ + if (vp->v_mount->mnt_flag & MNT_RDONLY) + return EROFS; + + /* Immutable or append-only files cannot be modified, either. */ + if (node->tn_flags & (IMMUTABLE | APPEND)) + return EPERM; + + TMPFS_NODE_LOCK(node); + if (atime->tv_sec != VNOVAL && atime->tv_nsec != VNOVAL) + node->tn_status |= TMPFS_NODE_ACCESSED; + + if (mtime->tv_sec != VNOVAL && mtime->tv_nsec != VNOVAL) + node->tn_status |= TMPFS_NODE_MODIFIED; + + TMPFS_NODE_UNLOCK(node); + + tmpfs_itimes(vp, atime, mtime); + + KKASSERT(vn_islocked(vp)); + + return 0; +} + +/* --------------------------------------------------------------------- */ +/* Sync timestamps */ +void +tmpfs_itimes(struct vnode *vp, const struct timespec *acc, + const struct timespec *mod) +{ + struct tmpfs_node *node; + struct timespec now; + + node = VP_TO_TMPFS_NODE(vp); + + if ((node->tn_status & (TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | + TMPFS_NODE_CHANGED)) == 0) + return; + + vfs_timestamp(&now); + + TMPFS_NODE_LOCK(node); + if (node->tn_status & TMPFS_NODE_ACCESSED) { + if (acc == NULL) + acc = &now; + node->tn_atime = acc->tv_sec; + node->tn_atimensec = acc->tv_nsec; + } + if (node->tn_status & TMPFS_NODE_MODIFIED) { + if (mod == NULL) + mod = &now; + node->tn_mtime = mod->tv_sec; + node->tn_mtimensec = mod->tv_nsec; + } + if (node->tn_status & TMPFS_NODE_CHANGED) { + node->tn_ctime = now.tv_sec; + node->tn_ctimensec = now.tv_nsec; + } + node->tn_status &= + ~(TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | TMPFS_NODE_CHANGED); + TMPFS_NODE_UNLOCK(node); +} + +/* --------------------------------------------------------------------- */ + +void +tmpfs_update(struct vnode *vp) +{ + + tmpfs_itimes(vp, NULL, NULL); +} + +/* --------------------------------------------------------------------- */ + +int +tmpfs_truncate(struct vnode *vp, off_t length) +{ + int error; + struct tmpfs_node *node; + + node = VP_TO_TMPFS_NODE(vp); + + if (length < 0) { + error = EINVAL; + goto out; + } + + if (node->tn_size == length) { + error = 0; + goto out; + } + + if (length > VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) + return (EFBIG); + + + error = tmpfs_reg_resize(vp, length, 1); + + if (error == 0) { + TMPFS_NODE_LOCK(node); + node->tn_status |= TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; + TMPFS_NODE_UNLOCK(node); + } + +out: + tmpfs_update(vp); + + return error; +} + +/* --------------------------------------------------------------------- */ + +static ino_t +tmpfs_fetch_ino(void) +{ + ino_t ret; + + spin_lock(&ino_lock); + ret = t_ino++; + spin_unlock(&ino_lock); + + return ret; +} diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c new file mode 100644 index 0000000000..e556c7703a --- /dev/null +++ b/sys/vfs/hammer2/hammer2_vfsops.c @@ -0,0 +1,1008 @@ +/* + * Copyright (c) 2011, 2012 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/*- + * Copyright (c) 2005 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Julio M. Merino Vidal, developed as part of Google's Summer of Code + * 2005 program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "hammer2.h"
+#include "hammer2_disk.h"
+#include "hammer2_mount.h"
+
+static int hammer2_init(struct vfsconf *conf);
+static int hammer2_mount(struct mount *mp, char *path, caddr_t data,
+			 struct ucred *cred);
+static int hammer2_remount(struct mount *, char *, struct vnode *,
+			   struct ucred *);
+static int hammer2_unmount(struct mount *mp, int mntflags);
+static int hammer2_root(struct mount *mp, struct vnode **vpp);
+static int hammer2_statfs(struct mount *mp, struct statfs *sbp,
+			  struct ucred *cred);
+static int hammer2_statvfs(struct mount *mp, struct statvfs *sbp,
+			   struct ucred *cred);
+static int hammer2_sync(struct mount *mp, int waitfor);
+static int hammer2_vget(struct mount *mp, struct vnode *dvp,
+			ino_t ino, struct vnode **vpp);
+static int hammer2_fhtovp(struct mount *mp, struct vnode *rootvp,
+			  struct fid *fhp, struct vnode **vpp);
+static int hammer2_vptofh(struct vnode *vp, struct fid *fhp);
+static int hammer2_checkexp(struct mount *mp, struct sockaddr *nam,
+			    int *exflagsp, struct ucred **credanonp);
+
+static int tmpfs_unmount(struct mount *, int);
+static int tmpfs_root(struct mount *, struct vnode **);
+
+/*
+ * HAMMER2 vfs operations.
+ */
+static struct vfsops hammer2_vfsops = {
+	/* From tmpfs */
+	.vfs_root	= tmpfs_root,
+
+	/* From HAMMER2 */
+	.vfs_init	= hammer2_init,
+	.vfs_sync	= hammer2_sync,
+	.vfs_mount	= hammer2_mount,
+	.vfs_unmount	= hammer2_unmount,
+#ifdef notyet
+	.vfs_root	= hammer2_root,
+#endif
+	.vfs_statfs	= hammer2_statfs,
+	/*
+	 * If we enable statvfs before implementing it, we disappear in
+	 * df; that makes debugging difficult :)
+	 */
+/*	.vfs_statvfs	= hammer2_statvfs, */
+	.vfs_vget	= hammer2_vget,
+	.vfs_vptofh	= hammer2_vptofh,
+	.vfs_fhtovp	= hammer2_fhtovp,
+	.vfs_checkexp	= hammer2_checkexp
+};
+
+
+MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");
+
+VFS_SET(hammer2_vfsops, hammer2, 0);
+MODULE_VERSION(hammer2, 1);
+
+static int
+hammer2_init(struct vfsconf *conf)
+{
+	int error;
+
+	error = 0;
+
+	if (HAMMER2_BLOCKREF_SIZE != sizeof(struct hammer2_blockref))
+		error = EINVAL;
+	if (HAMMER2_INODE_SIZE != sizeof(struct hammer2_inode_data))
+		error = EINVAL;
+	if (HAMMER2_ALLOCREF_SIZE != sizeof(struct hammer2_allocref))
+		error = EINVAL;
+	if (HAMMER2_VOLUME_SIZE != sizeof(struct hammer2_volume_data))
+		error = EINVAL;
+
+	if (error)
+		kprintf("HAMMER2 structure size mismatch; cannot continue.\n");
+
+	return (error);
+}
+
+/*
+ * Mount or remount a HAMMER2 filesystem from physical media
+ *
+ *	mountroot
+ *		mp	mount point structure
+ *		path	NULL
+ *		data
+ *		cred
+ *
+ *	mount
+ *		mp	mount point structure
+ *		path	path to mount point
+ *		data	pointer to argument structure in user space
+ *			volume	volume path (device@LABEL form)
+ *			hflags	user mount flags
+ *		cred	user credentials
+ *
+ * RETURNS:	0	Success
+ *		!0	error number
+ */
+static int
+hammer2_mount(struct mount *mp, char *path, caddr_t data,
+	      struct ucred *cred)
+{
+	struct hammer2_mount_info info;
+	struct hammer2_mount *hmp;
+	struct vnode *devvp;
+	struct nlookupdata nd;
+	char devstr[MNAMELEN];
+	size_t size;
+	size_t done;
+	char *dev, *label;
+	int ronly;
+	int error;
+	int rc;
+
+	hmp = NULL;
+	dev = label = NULL;
+	devvp = NULL;
+
+	kprintf("hammer2_mount\n");
+
+	if (path == NULL) {
+		/*
+		 * Root mount
+		 */
+		return (EOPNOTSUPP);
+	} else {
+		/*
+		 * Non-root mount or updating a mount
+		 */
+		error = copyin(data, &info, sizeof(info));
+		if (error)
+			return (error);
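+
+		/*
+		 * (Editor's note, illustration only, not part of this
+		 * patch: userland hands us a hammer2_mount_info; a caller
+		 * such as mount_hammer2 would roughly do:
+		 *
+		 *	struct hammer2_mount_info info;
+		 *
+		 *	info.volume = "/dev/ad0s1d@ROOT";  (hypothetical path)
+		 *	info.hflags = 0;
+		 *	mount("hammer2", mntpt, 0, &info);
+		 *
+		 * The device@LABEL string is split apart below.)
+		 */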
+
+		error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
+		if (error)
+			return (error);
+
+		/* Extract device and label */
+		dev = devstr;
+		label = strchr(devstr, '@');
+		if (label == NULL ||
+		    ((label + 1) - dev) > done)
+			return (EINVAL);
+		*label = '\0';
+		label++;
+		if (*label == '\0')
+			return (EINVAL);
+
+		if (mp->mnt_flag & MNT_UPDATE) {
+			/* Update mount */
+			/* HAMMER2 implements NFS export via mountctl */
+			hmp = MPTOH2(mp);
+			devvp = hmp->hm_devvp;
+			return hammer2_remount(mp, path, devvp, cred);
+		}
+	}
+
+	/*
+	 * New non-root mount
+	 */
+	/* Lookup name and verify it refers to a block device */
+	error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
+	if (error)
+		return (error);
+	error = nlookup(&nd);
+	if (error)
+		return (error);
+	error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
+	if (error)
+		return (error);
+	nlookup_done(&nd);
+
+	if (!vn_isdisk(devvp, &error)) {
+		vrele(devvp);
+		return (error);
+	}
+
+	/*
+	 * Common path for new root/non-root mounts;
+	 * devvp is a ref'd but not locked vnode referring to the fs device
+	 */
+	error = vfs_mountedon(devvp);
+	if (error) {
+		vrele(devvp);
+		return (error);
+	}
+
+	if (vcount(devvp) > 0) {
+		vrele(devvp);
+		return (EBUSY);
+	}
+
+	/*
+	 * Open the fs device
+	 */
+	ronly = (mp->mnt_flag & MNT_RDONLY) != 0;
+	vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
+	error = vinvalbuf(devvp, V_SAVE, 0, 0);
+	if (error) {
+		vn_unlock(devvp);
+		vrele(devvp);
+		return (error);
+	}
+	/*
+	 * This is correct; however due to an NFS quirk of my setup, FREAD
+	 * is required...
+	 */
+	/*
+	error = VOP_OPEN(devvp, ronly ? FREAD : FREAD | FWRITE, FSCRED, NULL);
+	*/
+	error = VOP_OPEN(devvp, FREAD, FSCRED, NULL);
+	vn_unlock(devvp);
+	if (error) {
+		vrele(devvp);
+		return (error);
+	}
+
+#ifdef notyet
+	/* VOP_IOCTL(EXTENDED_DISK_INFO, devvp); */
+	/* if vn device, never use bdwrite(); */
+	/* check if device supports BUF_CMD_READALL; */
+	/* check if device supports BUF_CMD_WRITEALL; */
+#endif
+
+	hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
+	/*mp->mnt_data = (qaddr_t) hmp;*/
+	hmp->hm_mp = mp;
+	/*hmp->hm_ronly = ronly;*/
+	/*hmp->hm_devvp = devvp;*/
+	lockinit(&hmp->hm_lk, "h2mp", 0, 0);
+	kmalloc_create(&hmp->hm_inodes, "HAMMER2-inodes");
+	kmalloc_create(&hmp->hm_ipstacks, "HAMMER2-ipstacks");
+
+	/* Read the volume headers, make sure we have a live filesystem */
+	/* Kinda hacky atm */
+	{
+		struct buf *bps[HAMMER2_NUM_VOLHDRS];
+		int valid = 0;
+		hammer2_tid_t hi_tid = 0;
+		int hi_num = 0;
+		int i;
+		uint32_t crc;
+		struct hammer2_volume_data *vd;
+
+		for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
+			rc = bread(devvp, i * HAMMER2_RESERVE_ALIGN64,
+				   HAMMER2_BUFSIZE, &bps[i]);
+			if (rc != 0) {
+				brelse(bps[i]);
+				bps[i] = NULL;
+				continue;
+			}
+
+			vd = (struct hammer2_volume_data *)bps[i]->b_data;
+			if (vd->magic == HAMMER2_VOLUME_ID_HBO) {
+				uint32_t ccrc;
+				unsigned char tmp[512];
+
+				bcopy(bps[i]->b_data, &tmp, 512);
+				bzero(&tmp[512 - 4], 4);
+				/* Calculate CRC32 w/ crc field zero */
+				/* XXX: Can we modify b_data? */
+				ccrc = hammer2_icrc32(tmp, 512);
+				crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
+
+				if (ccrc != crc) {
+					brelse(bps[i]);
+					bps[i] = NULL;
+					continue;
+				}
+
+				valid++;
+				if (vd->last_tid > hi_tid) {
+					hi_tid = vd->last_tid;
+					hi_num = i;
+				}
+			}
+		}
+		if (valid) {
+			/*
+			 * We have found the hammer volume header w/
+			 * the highest transaction id. Use it.
+			 */
+			bcopy(bps[hi_num]->b_data, &hmp->hm_sb,
+			      HAMMER2_BUFSIZE);
+
+			for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
+				if (bps[i])
+					brelse(bps[i]);
+			}
+
+			kprintf("HAMMER2 volume size %ju bytes\n",
+				(uintmax_t)hmp->hm_sb.volu_size);
+		} else {
+			/* XXX More to do!
Release structures and stuff */ + return (EINVAL); + } + } + + /* + * Filesystem subroutines are self-synchronized + */ + /*mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;*/ + + + /* Setup root inode */ + hmp->hm_iroot = alloci(hmp); + hmp->hm_iroot->type = HAMMER2_INODE_DIR | HAMMER2_INODE_ROOT; + hmp->hm_iroot->hi_inum = 1; + + /* currently rely on tmpfs routines */ + /*vfs_getnewfsid(mp);*/ + /*vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);*/ + /*vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);*/ + /*vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);*/ + + copystr("hammer2", mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size); + bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size); + bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname)); + copyinstr(path, mp->mnt_stat.f_mntonname, + sizeof(mp->mnt_stat.f_mntonname) - 1, + &size); + + hammer2_statfs(mp, &mp->mnt_stat, cred); + + hammer2_inode_unlock_ex(hmp->hm_iroot); + + return (tmpfs_mount(hmp, mp, path, data, cred)); +} + +static int +hammer2_remount(struct mount *mp, char *path, struct vnode *devvp, + struct ucred *cred) +{ + return (0); +} + +static int +hammer2_unmount(struct mount *mp, int mntflags) +{ + struct hammer2_mount *hmp; + int flags; + int error; + + kprintf("hammer2_unmount\n"); + + hmp = MPTOH2(mp); + flags = 0; + + if (mntflags & MNT_FORCE) + flags |= FORCECLOSE; + + hammer2_mount_exlock(hmp); + + error = vflush(mp, 0, flags); + + /* + * Work to do: + * 1) Wait on the flusher having no work; heat up if needed + * 2) Scan inode RB tree till all the inodes are free + * 3) Destroy the kmalloc inode zone + * 4) Free the mount point + */ + + kmalloc_destroy(&hmp->hm_inodes); + kmalloc_destroy(&hmp->hm_ipstacks); + + hammer2_mount_unlock(hmp); + + // Tmpfs does this + //kfree(hmp, M_HAMMER2); + + return (tmpfs_unmount(mp, mntflags)); + + return (error); +} + +static int +hammer2_vget(struct mount *mp, struct vnode *dvp, + ino_t ino, struct vnode **vpp) +{ + kprintf("hammer2_vget\n"); + return (EOPNOTSUPP); +} + +static int +hammer2_root(struct mount *mp, struct vnode **vpp) +{ + struct hammer2_mount *hmp; + int error; + struct vnode *vp; + + kprintf("hammer2_root\n"); + + hmp = MPTOH2(mp); + hammer2_mount_lock_ex(hmp); + if (hmp->hm_iroot == NULL) { + *vpp = NULL; + error = EINVAL; + } else { + vp = igetv(hmp->hm_iroot, &error); + *vpp = vp; + if (vp == NULL) + kprintf("vnodefail\n"); + } + hammer2_mount_unlock(hmp); + + return (error); +} + +static int +hammer2_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred) +{ + struct hammer2_mount *hmp; + + kprintf("hammer2_statfs\n"); + + hmp = MPTOH2(mp); + + sbp->f_iosize = PAGE_SIZE; + sbp->f_bsize = PAGE_SIZE; + + sbp->f_blocks = 10; + sbp->f_bavail = 10; + sbp->f_bfree = 10; + + sbp->f_files = 10; + sbp->f_ffree = 10; + sbp->f_owner = 0; + + return (0); +} + +static int +hammer2_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred) +{ + kprintf("hammer2_statvfs\n"); + return (EOPNOTSUPP); +} + +/* + * Sync the entire filesystem; this is called from the filesystem syncer + * process periodically and whenever a user calls sync(1) on the hammer + * mountpoint. + * + * Currently is actually called from the syncer! \o/ + * + * This task will have to snapshot the state of the dirty inode chain. + * From that, it will have to make sure all of the inodes on the dirty + * chain have IO initiated. We make sure that io is initiated for the root + * block. 
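+ *
+ * Ordering note: the new root block should go out to media only after
+ * the blocks it references have been written, otherwise a crash could
+ * leave the volume header pointing at stale data.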
+ * + * If waitfor is set, we wait for media to acknowledge the new rootblock. + * + * THINKS: side A vs side B, to have sync not stall all I/O? + */ +static int +hammer2_sync(struct mount *mp, int waitfor) +{ + struct hammer2_mount *hmp; + struct hammer2_inode *ip; + + kprintf("hammer2_sync \n"); + +// hmp = MPTOH2(mp); + + return (0); +} + +static int +hammer2_vptofh(struct vnode *vp, struct fid *fhp) +{ + return (0); +} + +static int +hammer2_fhtovp(struct mount *mp, struct vnode *rootvp, + struct fid *fhp, struct vnode **vpp) +{ + return (0); +} + +static int +hammer2_checkexp(struct mount *mp, struct sockaddr *nam, + int *exflagsp, struct ucred **credanonp) +{ + return (0); +} + +/* + * Efficient memory file system. + * + * tmpfs is a file system that uses NetBSD's virtual memory sub-system + * (the well-known UVM) to store file data and metadata in an efficient + * way. This means that it does not follow the structure of an on-disk + * file system because it simply does not need to. Instead, it uses + * memory-specific data structures and algorithms to automatically + * allocate and release resources. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "hammer2.h" + +/* + * Default permission for root node + */ +#define TMPFS_DEFAULT_ROOT_MODE (S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH) + +/* --------------------------------------------------------------------- */ +int +tmpfs_node_ctor(void *obj, void *privdata, int flags) +{ + struct tmpfs_node *node = (struct tmpfs_node *)obj; + + node->tn_gen++; + node->tn_size = 0; + node->tn_status = 0; + node->tn_flags = 0; + node->tn_links = 0; + node->tn_vnode = NULL; + node->tn_vpstate = TMPFS_VNODE_WANT; + bzero(&node->tn_spec, sizeof(node->tn_spec)); + + return (1); +} + +static void +tmpfs_node_dtor(void *obj, void *privdata) +{ + struct tmpfs_node *node = (struct tmpfs_node *)obj; + node->tn_type = VNON; + node->tn_vpstate = TMPFS_VNODE_DOOMED; +} + +static void* +tmpfs_node_init(void *args, int flags) +{ + struct tmpfs_node *node = (struct tmpfs_node *)objcache_malloc_alloc(args, flags); + if (node == NULL) + return (NULL); + node->tn_id = 0; + + lockinit(&node->tn_interlock, "tmpfs node interlock", 0, LK_CANRECURSE); + node->tn_gen = karc4random(); + + return node; +} + +static void +tmpfs_node_fini(void *obj, void *args) +{ + struct tmpfs_node *node = (struct tmpfs_node *)obj; + lockuninit(&node->tn_interlock); + objcache_malloc_free(obj, args); +} + +int +tmpfs_mount(struct hammer2_mount *hmp, + struct mount *mp, char *path, caddr_t data, struct ucred *cred) +{ +// struct tmpfs_mount *tmp; + struct tmpfs_node *root; +// struct tmpfs_args args; + vm_pindex_t pages; + vm_pindex_t pages_limit; + ino_t nodes; + u_int64_t maxfsize; + int error; + /* Size counters. */ + ino_t nodes_max; + off_t size_max; + size_t maxfsize_max; + size_t size; + + /* Root node attributes. */ + uid_t root_uid = cred->cr_uid; + gid_t root_gid = cred->cr_gid; + mode_t root_mode = (VREAD | VWRITE); + + if (mp->mnt_flag & MNT_UPDATE) { + /* XXX: There is no support yet to update file system + * settings. Should be added. 
*/ + + return EOPNOTSUPP; + } + + kprintf("tmpfs_mount\n"); + + /* + * mount info + */ +// bzero(&args, sizeof(args)); + size_max = 0; + nodes_max = 0; + maxfsize_max = 0; + + if (path) { + if (data) { +// error = copyin(data, &args, sizeof(args)); +// if (error) +// return (error); + } + /* + size_max = args.ta_size_max; + nodes_max = args.ta_nodes_max; + maxfsize_max = args.ta_maxfsize_max; + root_uid = args.ta_root_uid; + root_gid = args.ta_root_gid; + root_mode = args.ta_root_mode; + */ + } + + /* + * If mount by non-root, then verify that user has necessary + * permissions on the device. + */ + if (cred->cr_uid != 0) { + root_mode = VREAD; + if ((mp->mnt_flag & MNT_RDONLY) == 0) + root_mode |= VWRITE; + } + + pages_limit = vm_swap_max + vmstats.v_page_count / 2; + + if (size_max == 0) + pages = pages_limit / 2; + else if (size_max < PAGE_SIZE) + pages = 1; + else if (OFF_TO_IDX(size_max) > pages_limit) + pages = pages_limit; + else + pages = OFF_TO_IDX(size_max); + + if (nodes_max == 0) + nodes = 3 + pages * PAGE_SIZE / 1024; + else if (nodes_max < 3) + nodes = 3; + else if (nodes_max > pages) + nodes = pages; + else + nodes = nodes_max; + + maxfsize = IDX_TO_OFF(pages_limit); + if (maxfsize_max != 0 && maxfsize > maxfsize_max) + maxfsize = maxfsize_max; + + /* Allocate the tmpfs mount structure and fill it. */ +// tmp = kmalloc(sizeof(*tmp), M_HAMMER2, M_WAITOK | M_ZERO); + + struct hammer2_mount *tmp = hmp; + lockinit(&(tmp->allnode_lock), "tmpfs allnode lock", 0, LK_CANRECURSE); + tmp->tm_nodes_max = nodes; + tmp->tm_nodes_inuse = 0; + tmp->tm_maxfilesize = maxfsize; + LIST_INIT(&tmp->tm_nodes_used); + + tmp->tm_pages_max = pages; + tmp->tm_pages_used = 0; + + kmalloc_create(&tmp->tm_node_zone, "tmpfs node"); + kmalloc_create(&tmp->tm_dirent_zone, "tmpfs dirent"); + kmalloc_create(&tmp->tm_name_zone, "tmpfs name zone"); + + kmalloc_raise_limit(tmp->tm_node_zone, sizeof(struct tmpfs_node) * + tmp->tm_nodes_max); + + tmp->tm_node_zone_malloc_args.objsize = sizeof(struct tmpfs_node); + tmp->tm_node_zone_malloc_args.mtype = tmp->tm_node_zone; + + tmp->tm_dirent_zone_malloc_args.objsize = sizeof(struct tmpfs_dirent); + tmp->tm_dirent_zone_malloc_args.mtype = tmp->tm_dirent_zone; + + tmp->tm_dirent_pool = objcache_create( "tmpfs dirent cache", + 0, 0, + NULL, NULL, NULL, + objcache_malloc_alloc, objcache_malloc_free, + &tmp->tm_dirent_zone_malloc_args); + tmp->tm_node_pool = objcache_create( "tmpfs node cache", + 0, 0, + tmpfs_node_ctor, tmpfs_node_dtor, NULL, + tmpfs_node_init, tmpfs_node_fini, + &tmp->tm_node_zone_malloc_args); + + /* Allocate the root node. */ + error = tmpfs_alloc_node(tmp, VDIR, root_uid, root_gid, + root_mode & ALLPERMS, NULL, NULL, + VNOVAL, VNOVAL, &root); + + /* + * We are backed by swap, set snocache chflags flag so we + * don't trip over swapcache. 
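+	 * (SF_NOCACHE keeps DragonFly's swapcache away from these pages;
+	 * swapcaching swap-backed tmpfs data would just double-buffer it.)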
+ */ + root->tn_flags = SF_NOCACHE; + + if (error != 0 || root == NULL) { + objcache_destroy(tmp->tm_node_pool); + objcache_destroy(tmp->tm_dirent_pool); + kfree(tmp, M_HAMMER2); + return error; + } + KASSERT(root->tn_id >= 0, ("tmpfs root with invalid ino: %d", (int)root->tn_id)); + tmp->tm_root = root; + + mp->mnt_flag |= MNT_LOCAL; +#if 0 + mp->mnt_kern_flag |= MNTK_RD_MPSAFE | MNTK_WR_MPSAFE | MNTK_GA_MPSAFE | + MNTK_IN_MPSAFE | MNTK_SG_MPSAFE; +#endif + mp->mnt_kern_flag |= MNTK_RD_MPSAFE | MNTK_GA_MPSAFE | MNTK_SG_MPSAFE; + mp->mnt_kern_flag |= MNTK_WR_MPSAFE; + mp->mnt_kern_flag |= MNTK_NOMSYNC; + mp->mnt_kern_flag |= MNTK_THR_SYNC; + mp->mnt_data = (qaddr_t)tmp; + vfs_getnewfsid(mp); + + vfs_add_vnodeops(mp, &tmpfs_vnode_vops, &mp->mnt_vn_norm_ops); + vfs_add_vnodeops(mp, &tmpfs_fifo_vops, &mp->mnt_vn_fifo_ops); + + hammer2_statfs(mp, &mp->mnt_stat, cred); + + return 0; +} + +/* --------------------------------------------------------------------- */ + +/* ARGSUSED2 */ +static int +tmpfs_unmount(struct mount *mp, int mntflags) +{ + int error; + int flags = 0; + int found; + struct hammer2_mount *tmp; + struct tmpfs_node *node; + + kprintf("tmpfs_umount\n"); + + /* Handle forced unmounts. */ + if (mntflags & MNT_FORCE) + flags |= FORCECLOSE; + + tmp = VFS_TO_TMPFS(mp); + + /* + * Finalize all pending I/O. In the case of tmpfs we want + * to throw all the data away so clean out the buffer cache + * and vm objects before calling vflush(). + */ + LIST_FOREACH(node, &tmp->tm_nodes_used, tn_entries) { + if (node->tn_type == VREG && node->tn_vnode) { + ++node->tn_links; + TMPFS_NODE_LOCK(node); + vx_get(node->tn_vnode); + tmpfs_truncate(node->tn_vnode, 0); + vx_put(node->tn_vnode); + TMPFS_NODE_UNLOCK(node); + --node->tn_links; + } + } + error = vflush(mp, 0, flags); + if (error != 0) + return error; + + /* + * First pass get rid of all the directory entries and + * vnode associations. The directory structure will + * remain via the extra link count representing tn_dir.tn_parent. + * + * No vnodes should remain after the vflush above. + */ + LIST_FOREACH(node, &tmp->tm_nodes_used, tn_entries) { + ++node->tn_links; + TMPFS_NODE_LOCK(node); + if (node->tn_type == VDIR) { + struct tmpfs_dirent *de; + + while (!TAILQ_EMPTY(&node->tn_dir.tn_dirhead)) { + de = TAILQ_FIRST(&node->tn_dir.tn_dirhead); + tmpfs_dir_detach(node, de); + tmpfs_free_dirent(tmp, de); + node->tn_size -= sizeof(struct tmpfs_dirent); + } + } + KKASSERT(node->tn_vnode == NULL); +#if 0 + vp = node->tn_vnode; + if (vp != NULL) { + tmpfs_free_vp(vp); + vrecycle(vp); + node->tn_vnode = NULL; + } +#endif + TMPFS_NODE_UNLOCK(node); + --node->tn_links; + } + + /* + * Now get rid of all nodes. We can remove any node with a + * link count of 0 or any directory node with a link count of + * 1. The parents will not be destroyed until all their children + * have been destroyed. + * + * Recursion in tmpfs_free_node() can further modify the list so + * we cannot use a next pointer here. + * + * The root node will be destroyed by this loop (it will be last). 
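+	 * Each pass must free at least one node or we give up (see the
+	 * found == 0 test below), so the loop terminates even if the
+	 * tree is damaged.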
+ */ + while (!LIST_EMPTY(&tmp->tm_nodes_used)) { + found = 0; + LIST_FOREACH(node, &tmp->tm_nodes_used, tn_entries) { + if (node->tn_links == 0 || + (node->tn_links == 1 && node->tn_type == VDIR)) { + TMPFS_NODE_LOCK(node); + tmpfs_free_node(tmp, node); + /* eats lock */ + found = 1; + break; + } + } + if (found == 0) { + kprintf("tmpfs: Cannot free entire node tree!"); + break; + } + } + + KKASSERT(tmp->tm_root == NULL); + + objcache_destroy(tmp->tm_dirent_pool); + objcache_destroy(tmp->tm_node_pool); + + kmalloc_destroy(&tmp->tm_name_zone); + kmalloc_destroy(&tmp->tm_dirent_zone); + kmalloc_destroy(&tmp->tm_node_zone); + + tmp->tm_node_zone = tmp->tm_dirent_zone = NULL; + + lockuninit(&tmp->allnode_lock); + KKASSERT(tmp->tm_pages_used == 0); + KKASSERT(tmp->tm_nodes_inuse == 0); + + /* Throw away the hammer2_mount structure. */ + kfree(tmp, M_HAMMER2); + mp->mnt_data = NULL; + + mp->mnt_flag &= ~MNT_LOCAL; + return 0; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_root(struct mount *mp, struct vnode **vpp) +{ + struct hammer2_mount *tmp; + int error; + + kprintf("tmpfs_root\n"); + + tmp = VFS_TO_TMPFS(mp); + if (tmp->tm_root == NULL) { + kprintf("tmpfs_root: called without root node %p\n", mp); + print_backtrace(-1); + *vpp = NULL; + error = EINVAL; + } else { + error = tmpfs_alloc_vp(mp, tmp->tm_root, LK_EXCLUSIVE, vpp); + (*vpp)->v_flag |= VROOT; + (*vpp)->v_type = VDIR; + } + return error; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_fhtovp(struct mount *mp, struct vnode *rootvp, struct fid *fhp, struct vnode **vpp) +{ + boolean_t found; + struct tmpfs_fid *tfhp; + struct hammer2_mount *tmp; + struct tmpfs_node *node; + + tmp = VFS_TO_TMPFS(mp); + + tfhp = (struct tmpfs_fid *)fhp; + if (tfhp->tf_len != sizeof(struct tmpfs_fid)) + return EINVAL; + + if (tfhp->tf_id >= tmp->tm_nodes_max) + return EINVAL; + + found = FALSE; + + TMPFS_LOCK(tmp); + LIST_FOREACH(node, &tmp->tm_nodes_used, tn_entries) { + if (node->tn_id == tfhp->tf_id && + node->tn_gen == tfhp->tf_gen) { + found = TRUE; + break; + } + } + TMPFS_UNLOCK(tmp); + + if (found) + return (tmpfs_alloc_vp(mp, node, LK_EXCLUSIVE, vpp)); + + return (EINVAL); +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_vptofh(struct vnode *vp, struct fid *fhp) +{ + struct tmpfs_node *node; + struct tmpfs_fid tfh; + node = VP_TO_TMPFS_NODE(vp); + memset(&tfh, 0, sizeof(tfh)); + tfh.tf_len = sizeof(struct tmpfs_fid); + tfh.tf_gen = node->tn_gen; + tfh.tf_id = node->tn_id; + memcpy(fhp, &tfh, sizeof(tfh)); + return (0); +} diff --git a/sys/vfs/hammer2/hammer2_vnops.c b/sys/vfs/hammer2/hammer2_vnops.c new file mode 100644 index 0000000000..b257f3f82b --- /dev/null +++ b/sys/vfs/hammer2/hammer2_vnops.c @@ -0,0 +1,2012 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hammer2.h" + + +/* + * Last reference to a vnode is going away but it is still cached. + */ +int +hammer2_inactive(struct vop_inactive_args *ap) +{ + struct vnode *vp; + struct hammer2_inode *ip; + struct hammer2_mount *hmp; + + kprintf("hammer2_inactive\n"); + + vp = ap->a_vp; + ip = VTOI(vp); + // hmp = ip->hi_mp; + + return (0); +} + +/* + * Reclaim a vnode so that it can be reused; after the inode is + * disassociated, the filesystem must manage it alone. 
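+ * Both back-pointers (vp->v_data and ip->hi_vnode) are cleared below
+ * so that neither object can reach the other once reclaim returns.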
+ */ +int +hammer2_reclaim(struct vop_reclaim_args *ap) +{ + struct vnode *vp; + struct hammer2_inode *ip; + struct hammer2_mount *hmp; + + kprintf("hammer2_reclaim\n"); + + /* Is the vnode locked? Must it be on exit? */ + + vp = ap->a_vp; + ip = VTOI(vp); + hmp = ip->hi_mp; + + vp->v_data = NULL; + ip->hi_vnode = NULL; + + return (0); +} + + +int +hammer2_fsync(struct vop_fsync_args *ap) +{ + kprintf("hammer2_fsync\n"); + return (EOPNOTSUPP); +} + +int +hammer2_access(struct vop_access_args *ap) +{ + kprintf("hammer2_access\n"); + return (0); +} + +int +hammer2_getattr(struct vop_getattr_args *ap) +{ + struct vnode *vp; + struct vattr *vap; + struct hammer2_inode *ip; + + vp = ap->a_vp; + vap = ap->a_vap; + + kprintf("hammer2_getattr\n"); + + ip = VTOI(vp); + hammer2_inode_lock_sh(ip); + + vap->va_type = vp->v_type; + vap->va_mode = 0777; + vap->va_nlink = 1; + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_size = 0; + vap->va_blocksize = PAGE_SIZE; + vap->va_flags = 0; + + hammer2_inode_unlock_sh(ip); + + return (0); +} + +int +hammer2_readdir(struct vop_readdir_args *ap) +{ + kprintf("hammer2_readdir\n"); + return (EOPNOTSUPP); +} + +int +hammer2_read(struct vop_read_args *ap) +{ + +} + +int +hammer2_nresolve(struct vop_nresolve_args *ap) +{ + kprintf("hammer2_nresolve\n"); + return EOPNOTSUPP; +} + +int +hammer2_bmap(struct vop_bmap_args *ap) +{ + kprintf("hammer2_bmap\n"); + return (EOPNOTSUPP); +} + +int +hammer2_open(struct vop_open_args *ap) +{ + kprintf("hammer2_open\n"); + return vop_stdopen(ap); +} + +int +hammer2_strategy(struct vop_strategy_args *ap) +{ + struct vnode *vp; + struct bio *biop; + struct buf *bp; + struct hammer2_inode *ip; + int error; + + vp = ap->a_vp; + biop = ap->a_bio; + bp = biop->bio_buf; + ip = VTOI(vp); + + switch(bp->b_cmd) { + case (BUF_CMD_READ): + case (BUF_CMD_WRITE): + default: + bp->b_error = error = EINVAL; + bp->b_flags |= B_ERROR; + biodone(biop); + break; + } + + return (error); +} + +struct vop_ops hammer2_vnode_vops = { + .vop_default = vop_defaultop, + .vop_fsync = hammer2_fsync, + .vop_getpages = vop_stdgetpages, + .vop_putpages = vop_stdputpages, + .vop_access = hammer2_access, + .vop_getattr = hammer2_getattr, +//// .vop_readdir = hammer2_readdir, +// .vop_read = hammer2_read, +// .vop_write = hammer2_write, + .vop_open = hammer2_open, + .vop_inactive = hammer2_inactive, + .vop_reclaim = hammer2_reclaim, + .vop_nresolve = hammer2_nresolve, + +// .vop_bmap = hammer2_bmap, +// .vop_strategy = hammer2_strategy, +}; + +struct vop_ops hammer2_spec_vops = { + +}; + +struct vop_ops hammer2_fifo_vops = { + +}; +/*- + * Copyright (c) 2005, 2006 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Julio M. Merino Vidal, developed as part of Google's Summer of Code + * 2005 program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * $NetBSD: tmpfs_vnops.c,v 1.39 2007/07/23 15:41:01 jmmv Exp $ + */ + +/* + * tmpfs vnode interface. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include "hammer2.h" + +MALLOC_DECLARE(M_TMPFS); + +static void tmpfs_strategy_done(struct bio *bio); + +static __inline +void +tmpfs_knote(struct vnode *vp, int flags) +{ + if (flags) + KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags); +} + + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_nresolve(struct vop_nresolve_args *v) +{ + struct vnode *dvp = v->a_dvp; + struct vnode *vp = NULL; + struct namecache *ncp = v->a_nch->ncp; + struct tmpfs_node *tnode; + + int error; + struct tmpfs_dirent *de; + struct tmpfs_node *dnode; + + dnode = VP_TO_TMPFS_DIR(dvp); + + de = tmpfs_dir_lookup(dnode, NULL, ncp); + if (de == NULL) { + error = ENOENT; + } else { + /* + * Allocate a vnode for the node we found. + */ + tnode = de->td_node; + error = tmpfs_alloc_vp(dvp->v_mount, tnode, + LK_EXCLUSIVE | LK_RETRY, &vp); + if (error) + goto out; + KKASSERT(vp); + } + +out: + /* + * Store the result of this lookup in the cache. Avoid this if the + * request was for creation, as it does not improve timings on + * emprical tests. + */ + if (vp) { + vn_unlock(vp); + cache_setvp(v->a_nch, vp); + vrele(vp); + } else if (error == ENOENT) { + cache_setvp(v->a_nch, NULL); + } + return error; +} + +static int +tmpfs_nlookupdotdot(struct vop_nlookupdotdot_args *v) +{ + struct vnode *dvp = v->a_dvp; + struct vnode **vpp = v->a_vpp; + struct tmpfs_node *dnode = VP_TO_TMPFS_NODE(dvp); + struct ucred *cred = v->a_cred; + int error; + + *vpp = NULL; + /* Check accessibility of requested node as a first step. */ + error = VOP_ACCESS(dvp, VEXEC, cred); + if (error != 0) + return error; + + if (dnode->tn_dir.tn_parent != NULL) { + /* Allocate a new vnode on the matching entry. */ + error = tmpfs_alloc_vp(dvp->v_mount, dnode->tn_dir.tn_parent, + LK_EXCLUSIVE | LK_RETRY, vpp); + + if (*vpp) + vn_unlock(*vpp); + } + + return (*vpp == NULL) ? 
ENOENT : 0; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_ncreate(struct vop_ncreate_args *v) +{ + struct vnode *dvp = v->a_dvp; + struct vnode **vpp = v->a_vpp; + struct namecache *ncp = v->a_nch->ncp; + struct vattr *vap = v->a_vap; + struct ucred *cred = v->a_cred; + int error; + + KKASSERT(vap->va_type == VREG || vap->va_type == VSOCK); + + error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, NULL); + if (error == 0) { + cache_setunresolved(v->a_nch); + cache_setvp(v->a_nch, *vpp); + tmpfs_knote(dvp, NOTE_WRITE); + } + + return error; +} +/* --------------------------------------------------------------------- */ + +static int +tmpfs_nmknod(struct vop_nmknod_args *v) +{ + struct vnode *dvp = v->a_dvp; + struct vnode **vpp = v->a_vpp; + struct namecache *ncp = v->a_nch->ncp; + struct vattr *vap = v->a_vap; + struct ucred *cred = v->a_cred; + int error; + + if (vap->va_type != VBLK && vap->va_type != VCHR && + vap->va_type != VFIFO) + return EINVAL; + + error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, NULL); + if (error == 0) { + cache_setunresolved(v->a_nch); + cache_setvp(v->a_nch, *vpp); + tmpfs_knote(dvp, NOTE_WRITE); + } + + return error; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_open(struct vop_open_args *v) +{ + struct vnode *vp = v->a_vp; + int mode = v->a_mode; + + int error; + struct tmpfs_node *node; + + node = VP_TO_TMPFS_NODE(vp); + + /* The file is still active but all its names have been removed + * (e.g. by a "rmdir $(pwd)"). It cannot be opened any more as + * it is about to die. */ + if (node->tn_links < 1) + return (ENOENT); + + /* If the file is marked append-only, deny write requests. */ + if ((node->tn_flags & APPEND) && + (mode & (FWRITE | O_APPEND)) == FWRITE) { + error = EPERM; + } else { + return (vop_stdopen(v)); + } + return error; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_close(struct vop_close_args *v) +{ + struct vnode *vp = v->a_vp; + struct tmpfs_node *node; + + node = VP_TO_TMPFS_NODE(vp); + + if (node->tn_links > 0) { + /* Update node times. No need to do it if the node has + * been deleted, because it will vanish after we return. 
*/ + tmpfs_update(vp); + } + + return vop_stdclose(v); +} + +/* --------------------------------------------------------------------- */ + +int +tmpfs_access(struct vop_access_args *v) +{ + struct vnode *vp = v->a_vp; + int error; + struct tmpfs_node *node; + + node = VP_TO_TMPFS_NODE(vp); + + switch (vp->v_type) { + case VDIR: + /* FALLTHROUGH */ + case VLNK: + /* FALLTHROUGH */ + case VREG: + if ((v->a_mode & VWRITE) && (vp->v_mount->mnt_flag & MNT_RDONLY)) { + error = EROFS; + goto out; + } + break; + + case VBLK: + /* FALLTHROUGH */ + case VCHR: + /* FALLTHROUGH */ + case VSOCK: + /* FALLTHROUGH */ + case VFIFO: + break; + + default: + error = EINVAL; + goto out; + } + + if ((v->a_mode & VWRITE) && (node->tn_flags & IMMUTABLE)) { + error = EPERM; + goto out; + } + + error = vop_helper_access(v, node->tn_uid, node->tn_gid, node->tn_mode, 0); + +out: + + return error; +} + +/* --------------------------------------------------------------------- */ + +int +tmpfs_getattr(struct vop_getattr_args *v) +{ + struct vnode *vp = v->a_vp; + struct vattr *vap = v->a_vap; + struct tmpfs_node *node; + + node = VP_TO_TMPFS_NODE(vp); + + lwkt_gettoken(&vp->v_mount->mnt_token); + tmpfs_update(vp); + + vap->va_type = vp->v_type; + vap->va_mode = node->tn_mode; + vap->va_nlink = node->tn_links; + vap->va_uid = node->tn_uid; + vap->va_gid = node->tn_gid; + vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0]; + vap->va_fileid = node->tn_id; + vap->va_size = node->tn_size; + vap->va_blocksize = PAGE_SIZE; + vap->va_atime.tv_sec = node->tn_atime; + vap->va_atime.tv_nsec = node->tn_atimensec; + vap->va_mtime.tv_sec = node->tn_mtime; + vap->va_mtime.tv_nsec = node->tn_mtimensec; + vap->va_ctime.tv_sec = node->tn_ctime; + vap->va_ctime.tv_nsec = node->tn_ctimensec; + vap->va_gen = node->tn_gen; + vap->va_flags = node->tn_flags; + if (vp->v_type == VBLK || vp->v_type == VCHR) + { + vap->va_rmajor = umajor(node->tn_rdev); + vap->va_rminor = uminor(node->tn_rdev); + } + vap->va_bytes = round_page(node->tn_size); + vap->va_filerev = 0; + + lwkt_reltoken(&vp->v_mount->mnt_token); + + return 0; +} + +/* --------------------------------------------------------------------- */ + +int +tmpfs_setattr(struct vop_setattr_args *v) +{ + struct vnode *vp = v->a_vp; + struct vattr *vap = v->a_vap; + struct ucred *cred = v->a_cred; + struct tmpfs_node *node = VP_TO_TMPFS_NODE(vp); + int error = 0; + int kflags = 0; + + if (error == 0 && (vap->va_flags != VNOVAL)) { + error = tmpfs_chflags(vp, vap->va_flags, cred); + kflags |= NOTE_ATTRIB; + } + + if (error == 0 && (vap->va_size != VNOVAL)) { + if (vap->va_size > node->tn_size) + kflags |= NOTE_WRITE | NOTE_EXTEND; + else + kflags |= NOTE_WRITE; + error = tmpfs_chsize(vp, vap->va_size, cred); + } + + if (error == 0 && (vap->va_uid != (uid_t)VNOVAL || + vap->va_gid != (gid_t)VNOVAL)) { + error = tmpfs_chown(vp, vap->va_uid, vap->va_gid, cred); + kflags |= NOTE_ATTRIB; + } + + if (error == 0 && (vap->va_mode != (mode_t)VNOVAL)) { + error = tmpfs_chmod(vp, vap->va_mode, cred); + kflags |= NOTE_ATTRIB; + } + + if (error == 0 && ((vap->va_atime.tv_sec != VNOVAL && + vap->va_atime.tv_nsec != VNOVAL) || + (vap->va_mtime.tv_sec != VNOVAL && + vap->va_mtime.tv_nsec != VNOVAL) )) { + error = tmpfs_chtimes(vp, &vap->va_atime, &vap->va_mtime, + vap->va_vaflags, cred); + kflags |= NOTE_ATTRIB; + } + + /* Update the node times. We give preference to the error codes + * generated by this function rather than the ones that may arise + * from tmpfs_update. 
*/ + tmpfs_update(vp); + tmpfs_knote(vp, kflags); + + return error; +} + +/* --------------------------------------------------------------------- */ + +/* + * fsync is usually a NOP, but we must take action when unmounting or + * when recycling. + */ +static int +tmpfs_fsync(struct vop_fsync_args *v) +{ + struct hammer2_mount *tmp; + struct tmpfs_node *node; + struct vnode *vp = v->a_vp; + + tmp = VFS_TO_TMPFS(vp->v_mount); + node = VP_TO_TMPFS_NODE(vp); + + tmpfs_update(vp); + if (vp->v_type == VREG) { + if (vp->v_flag & VRECLAIMED) { + if (node->tn_links == 0) + tmpfs_truncate(vp, 0); + else + vfsync(v->a_vp, v->a_waitfor, 1, NULL, NULL); + } + } + return 0; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_read (struct vop_read_args *ap) +{ + struct buf *bp; + struct vnode *vp = ap->a_vp; + struct uio *uio = ap->a_uio; + struct tmpfs_node *node; + off_t base_offset; + size_t offset; + size_t len; + int error; + + error = 0; + if (uio->uio_resid == 0) { + return error; + } + + node = VP_TO_TMPFS_NODE(vp); + + if (uio->uio_offset < 0) + return (EINVAL); + if (vp->v_type != VREG) + return (EINVAL); + + while (uio->uio_resid > 0 && uio->uio_offset < node->tn_size) { + /* + * Use buffer cache I/O (via tmpfs_strategy) + */ + offset = (size_t)uio->uio_offset & BMASK; + base_offset = (off_t)uio->uio_offset - offset; + bp = getcacheblk(vp, base_offset, BSIZE); + if (bp == NULL) + { + lwkt_gettoken(&vp->v_mount->mnt_token); + error = bread(vp, base_offset, BSIZE, &bp); + if (error) { + brelse(bp); + lwkt_reltoken(&vp->v_mount->mnt_token); + kprintf("tmpfs_read bread error %d\n", error); + break; + } + lwkt_reltoken(&vp->v_mount->mnt_token); + } + + /* + * Figure out how many bytes we can actually copy this loop. + */ + len = BSIZE - offset; + if (len > uio->uio_resid) + len = uio->uio_resid; + if (len > node->tn_size - uio->uio_offset) + len = (size_t)(node->tn_size - uio->uio_offset); + + error = uiomove((char *)bp->b_data + offset, len, uio); + bqrelse(bp); + if (error) { + kprintf("tmpfs_read uiomove error %d\n", error); + break; + } + } + + TMPFS_NODE_LOCK(node); + node->tn_status |= TMPFS_NODE_ACCESSED; + TMPFS_NODE_UNLOCK(node); + + return(error); +} + +static int +hammer2_write(struct vop_write_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct uio *uio = ap->a_uio; + int rc; + + + return (rc); +} + +static int +tmpfs_write (struct vop_write_args *ap) +{ + struct buf *bp; + struct vnode *vp = ap->a_vp; + struct uio *uio = ap->a_uio; + struct thread *td = uio->uio_td; + struct tmpfs_node *node; + boolean_t extended; + off_t oldsize; + int error; + off_t base_offset; + size_t offset; + size_t len; + struct rlimit limit; + int trivial = 0; + int kflags = 0; + + error = 0; + if (uio->uio_resid == 0) { + return error; + } + + node = VP_TO_TMPFS_NODE(vp); + + if (vp->v_type != VREG) + return (EINVAL); + + lwkt_gettoken(&vp->v_mount->mnt_token); + + oldsize = node->tn_size; + if (ap->a_ioflag & IO_APPEND) + uio->uio_offset = node->tn_size; + + /* + * Check for illegal write offsets. 
+ */ + if (uio->uio_offset + uio->uio_resid > + VFS_TO_TMPFS(vp->v_mount)->tm_maxfilesize) { + lwkt_reltoken(&vp->v_mount->mnt_token); + return (EFBIG); + } + + if (vp->v_type == VREG && td != NULL) { + error = kern_getrlimit(RLIMIT_FSIZE, &limit); + if (error != 0) { + lwkt_reltoken(&vp->v_mount->mnt_token); + return error; + } + if (uio->uio_offset + uio->uio_resid > limit.rlim_cur) { + ksignal(td->td_proc, SIGXFSZ); + lwkt_reltoken(&vp->v_mount->mnt_token); + return (EFBIG); + } + } + + + /* + * Extend the file's size if necessary + */ + extended = ((uio->uio_offset + uio->uio_resid) > node->tn_size); + + while (uio->uio_resid > 0) { + /* + * Use buffer cache I/O (via tmpfs_strategy) + */ + offset = (size_t)uio->uio_offset & BMASK; + base_offset = (off_t)uio->uio_offset - offset; + len = BSIZE - offset; + if (len > uio->uio_resid) + len = uio->uio_resid; + + if ((uio->uio_offset + len) > node->tn_size) { + trivial = (uio->uio_offset <= node->tn_size); + error = tmpfs_reg_resize(vp, uio->uio_offset + len, trivial); + if (error) + break; + } + + /* + * Read to fill in any gaps. Theoretically we could + * optimize this if the write covers the entire buffer + * and is not a UIO_NOCOPY write, however this can lead + * to a security violation exposing random kernel memory + * (whatever junk was in the backing VM pages before). + * + * So just use bread() to do the right thing. + */ + error = bread(vp, base_offset, BSIZE, &bp); + error = uiomove((char *)bp->b_data + offset, len, uio); + if (error) { + kprintf("tmpfs_write uiomove error %d\n", error); + brelse(bp); + break; + } + + if (uio->uio_offset > node->tn_size) { + node->tn_size = uio->uio_offset; + kflags |= NOTE_EXTEND; + } + kflags |= NOTE_WRITE; + + /* + * The data has been loaded into the buffer, write it out. + * + * We want tmpfs to be able to use all available ram, not + * just the buffer cache, so if not explicitly paging we + * use buwrite() to leave the buffer clean but mark all the + * VM pages valid+dirty. + * + * When the kernel is paging, either via normal pageout + * operation or when cleaning the object during a recycle, + * the underlying VM pages are going to get thrown away + * so we MUST write them to swap. + * + * XXX unfortunately this catches msync() system calls too + * for the moment. + */ + if (vm_swap_size == 0) { + /* + * if swap isn't configured yet, force a buwrite() to + * avoid problems further down the line, due to flushing + * to swap. + */ + buwrite(bp); + } else { + if (ap->a_ioflag & IO_SYNC) { + bwrite(bp); + } else if ((ap->a_ioflag & IO_ASYNC) || + (uio->uio_segflg == UIO_NOCOPY)) { + bawrite(bp); + } else { + buwrite(bp); + } + } + + if (bp->b_error) { + kprintf("tmpfs_write bwrite error %d\n", bp->b_error); + break; + } + } + + if (error) { + if (extended) { + (void)tmpfs_reg_resize(vp, oldsize, trivial); + kflags &= ~NOTE_EXTEND; + } + goto done; + } + + TMPFS_NODE_LOCK(node); + node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_MODIFIED | + (extended? 
TMPFS_NODE_CHANGED : 0); + + if (node->tn_mode & (S_ISUID | S_ISGID)) { + if (priv_check_cred(ap->a_cred, PRIV_VFS_RETAINSUGID, 0)) + node->tn_mode &= ~(S_ISUID | S_ISGID); + } + TMPFS_NODE_UNLOCK(node); +done: + + tmpfs_knote(vp, kflags); + + + lwkt_reltoken(&vp->v_mount->mnt_token); + return(error); +} + +static int +tmpfs_advlock (struct vop_advlock_args *ap) +{ + struct tmpfs_node *node; + struct vnode *vp = ap->a_vp; + + node = VP_TO_TMPFS_NODE(vp); + + return (lf_advlock(ap, &node->tn_advlock, node->tn_size)); +} + +static int +tmpfs_strategy(struct vop_strategy_args *ap) +{ + struct bio *bio = ap->a_bio; + struct bio *nbio; + struct buf *bp = bio->bio_buf; + struct vnode *vp = ap->a_vp; + struct tmpfs_node *node; + vm_object_t uobj; + vm_page_t m; + int i; + + if (vp->v_type != VREG) { + bp->b_resid = bp->b_bcount; + bp->b_flags |= B_ERROR | B_INVAL; + bp->b_error = EINVAL; + biodone(bio); + return(0); + } + + lwkt_gettoken(&vp->v_mount->mnt_token); + node = VP_TO_TMPFS_NODE(vp); + + uobj = node->tn_reg.tn_aobj; + + /* + * Certain operations such as nvtruncbuf() can result in a + * bdwrite() of one or more buffers related to the file, + * leading to the possibility of our strategy function + * being called for writing even when there is no swap space. + * + * When this case occurs we mark the underlying pages as valid + * and dirty and complete the I/O manually. + * + * Otherwise just call swap_pager_strategy to read or write, + * potentially assigning swap on write. We push a BIO to catch + * any swap allocation errors. + */ + if (bp->b_cmd == BUF_CMD_WRITE && vm_swap_size == 0) { + for (i = 0; i < bp->b_xio.xio_npages; ++i) { + m = bp->b_xio.xio_pages[i]; + vm_page_set_validdirty(m, 0, PAGE_SIZE); + } + bp->b_resid = 0; + bp->b_error = 0; + biodone(bio); + } else { + nbio = push_bio(bio); + nbio->bio_done = tmpfs_strategy_done; + nbio->bio_offset = bio->bio_offset; + swap_pager_strategy(uobj, nbio); + } + + lwkt_reltoken(&vp->v_mount->mnt_token); + return 0; +} + +/* + * bio finished. If we ran out of sap just mark the pages valid + * and dirty and make it appear that the I/O has completed successfully. + */ +static void +tmpfs_strategy_done(struct bio *bio) +{ + struct buf *bp; + vm_page_t m; + int i; + + bp = bio->bio_buf; + + if ((bp->b_flags & B_ERROR) && bp->b_error == ENOMEM) { + bp->b_flags &= ~B_ERROR; + bp->b_error = 0; + bp->b_resid = 0; + for (i = 0; i < bp->b_xio.xio_npages; ++i) { + m = bp->b_xio.xio_pages[i]; + vm_page_set_validdirty(m, 0, PAGE_SIZE); + } + } + bio = pop_bio(bio); + biodone(bio); +} + +static int +tmpfs_bmap(struct vop_bmap_args *ap) +{ + if (ap->a_doffsetp != NULL) + *ap->a_doffsetp = ap->a_loffset; + if (ap->a_runp != NULL) + *ap->a_runp = 0; + if (ap->a_runb != NULL) + *ap->a_runb = 0; + + return 0; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_nremove(struct vop_nremove_args *v) +{ + struct vnode *dvp = v->a_dvp; + struct namecache *ncp = v->a_nch->ncp; + struct vnode *vp; + int error; + struct tmpfs_dirent *de; + struct hammer2_mount *tmp; + struct tmpfs_node *dnode; + struct tmpfs_node *node; + + /* + * We have to acquire the vp from v->a_nch because we will likely + * unresolve the namecache entry, and a vrele/vput is needed to + * trigger the tmpfs_inactive/tmpfs_reclaim sequence. + * + * We have to use vget to clear any inactive state on the vnode, + * otherwise the vnode may remain inactive and thus tmpfs_inactive + * will not get called when we release it. 
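+	 * (cache_vget() below returns the vnode referenced and locked;
+	 * we drop the lock right away and keep only the reference until
+	 * the final vrele() at "out:".)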
+ */ + error = cache_vget(v->a_nch, v->a_cred, LK_SHARED, &vp); + KKASSERT(error == 0); + vn_unlock(vp); + + if (vp->v_type == VDIR) { + error = EISDIR; + goto out; + } + + dnode = VP_TO_TMPFS_DIR(dvp); + node = VP_TO_TMPFS_NODE(vp); + tmp = VFS_TO_TMPFS(vp->v_mount); + de = tmpfs_dir_lookup(dnode, node, ncp); + if (de == NULL) { + error = ENOENT; + goto out; + } + + /* Files marked as immutable or append-only cannot be deleted. */ + if ((node->tn_flags & (IMMUTABLE | APPEND | NOUNLINK)) || + (dnode->tn_flags & APPEND)) { + error = EPERM; + goto out; + } + + /* Remove the entry from the directory; as it is a file, we do not + * have to change the number of hard links of the directory. */ + tmpfs_dir_detach(dnode, de); + + /* Free the directory entry we just deleted. Note that the node + * referred by it will not be removed until the vnode is really + * reclaimed. */ + tmpfs_free_dirent(tmp, de); + + if (node->tn_links > 0) { + TMPFS_NODE_LOCK(node); + node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ + TMPFS_NODE_MODIFIED; + TMPFS_NODE_UNLOCK(node); + } + + cache_setunresolved(v->a_nch); + cache_setvp(v->a_nch, NULL); + tmpfs_knote(vp, NOTE_DELETE); + /*cache_inval_vp(vp, CINV_DESTROY);*/ + tmpfs_knote(dvp, NOTE_WRITE); + error = 0; + +out: + vrele(vp); + + return error; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_nlink(struct vop_nlink_args *v) +{ + struct vnode *dvp = v->a_dvp; + struct vnode *vp = v->a_vp; + struct namecache *ncp = v->a_nch->ncp; + struct tmpfs_dirent *de; + struct tmpfs_node *node; + struct tmpfs_node *dnode; + int error; + + KKASSERT(dvp != vp); /* XXX When can this be false? */ + + node = VP_TO_TMPFS_NODE(vp); + dnode = VP_TO_TMPFS_NODE(dvp); + + /* XXX: Why aren't the following two tests done by the caller? */ + + /* Hard links of directories are forbidden. */ + if (vp->v_type == VDIR) { + error = EPERM; + goto out; + } + + /* Cannot create cross-device links. */ + if (dvp->v_mount != vp->v_mount) { + error = EXDEV; + goto out; + } + + /* Ensure that we do not overflow the maximum number of links imposed + * by the system. */ + KKASSERT(node->tn_links <= LINK_MAX); + if (node->tn_links == LINK_MAX) { + error = EMLINK; + goto out; + } + + /* We cannot create links of files marked immutable or append-only. */ + if (node->tn_flags & (IMMUTABLE | APPEND)) { + error = EPERM; + goto out; + } + + /* Allocate a new directory entry to represent the node. */ + error = tmpfs_alloc_dirent(VFS_TO_TMPFS(vp->v_mount), node, + ncp->nc_name, ncp->nc_nlen, &de); + if (error != 0) + goto out; + + /* Insert the new directory entry into the appropriate directory. */ + tmpfs_dir_attach(dnode, de); + + /* vp link count has changed, so update node times. 
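+	 * TMPFS_NODE_CHANGED marks the ctime for update; tmpfs_update()
+	 * below folds the pending status bits into the node's times.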
*/ + + TMPFS_NODE_LOCK(node); + node->tn_status |= TMPFS_NODE_CHANGED; + TMPFS_NODE_UNLOCK(node); + tmpfs_update(vp); + + tmpfs_knote(vp, NOTE_LINK); + cache_setunresolved(v->a_nch); + cache_setvp(v->a_nch, vp); + tmpfs_knote(dvp, NOTE_WRITE); + error = 0; + +out: + return error; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_nrename(struct vop_nrename_args *v) +{ + struct vnode *fdvp = v->a_fdvp; + struct namecache *fncp = v->a_fnch->ncp; + struct vnode *fvp = fncp->nc_vp; + struct vnode *tdvp = v->a_tdvp; + struct namecache *tncp = v->a_tnch->ncp; + struct vnode *tvp; + struct tmpfs_dirent *de; + struct hammer2_mount *tmp; + struct tmpfs_node *fdnode; + struct tmpfs_node *fnode; + struct tmpfs_node *tnode; + struct tmpfs_node *tdnode; + char *newname; + char *oldname; + int error; + + /* + * Because tvp can get overwritten we have to vget it instead of + * just vref or use it, otherwise it's VINACTIVE flag may not get + * cleared and the node won't get destroyed. + */ + error = cache_vget(v->a_tnch, v->a_cred, LK_SHARED, &tvp); + if (error == 0) { + tnode = VP_TO_TMPFS_NODE(tvp); + vn_unlock(tvp); + } else { + tnode = NULL; + } + + /* Disallow cross-device renames. + * XXX Why isn't this done by the caller? */ + if (fvp->v_mount != tdvp->v_mount || + (tvp != NULL && fvp->v_mount != tvp->v_mount)) { + error = EXDEV; + goto out; + } + + tmp = VFS_TO_TMPFS(tdvp->v_mount); + tdnode = VP_TO_TMPFS_DIR(tdvp); + + /* If source and target are the same file, there is nothing to do. */ + if (fvp == tvp) { + error = 0; + goto out; + } + + fdnode = VP_TO_TMPFS_DIR(fdvp); + fnode = VP_TO_TMPFS_NODE(fvp); + de = tmpfs_dir_lookup(fdnode, fnode, fncp); + + /* Avoid manipulating '.' and '..' entries. */ + if (de == NULL) { + error = ENOENT; + goto out_locked; + } + KKASSERT(de->td_node == fnode); + + /* + * If replacing an entry in the target directory and that entry + * is a directory, it must be empty. + * + * Kern_rename gurantees the destination to be a directory + * if the source is one (it does?). + */ + if (tvp != NULL) { + KKASSERT(tnode != NULL); + + if ((tnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) || + (tdnode->tn_flags & (APPEND | IMMUTABLE))) { + error = EPERM; + goto out_locked; + } + + if (fnode->tn_type == VDIR && tnode->tn_type == VDIR) { + if (tnode->tn_size > 0) { + error = ENOTEMPTY; + goto out_locked; + } + } else if (fnode->tn_type == VDIR && tnode->tn_type != VDIR) { + error = ENOTDIR; + goto out_locked; + } else if (fnode->tn_type != VDIR && tnode->tn_type == VDIR) { + error = EISDIR; + goto out_locked; + } else { + KKASSERT(fnode->tn_type != VDIR && + tnode->tn_type != VDIR); + } + } + + if ((fnode->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) || + (fdnode->tn_flags & (APPEND | IMMUTABLE))) { + error = EPERM; + goto out_locked; + } + + /* + * Ensure that we have enough memory to hold the new name, if it + * has to be changed. + */ + if (fncp->nc_nlen != tncp->nc_nlen || + bcmp(fncp->nc_name, tncp->nc_name, fncp->nc_nlen) != 0) { + newname = kmalloc(tncp->nc_nlen + 1, tmp->tm_name_zone, + M_WAITOK | M_NULLOK); + if (newname == NULL) { + error = ENOSPC; + goto out_locked; + } + bcopy(tncp->nc_name, newname, tncp->nc_nlen); + newname[tncp->nc_nlen] = '\0'; + } else { + newname = NULL; + } + + /* + * Unlink entry from source directory. Note that the kernel has + * already checked for illegal recursion cases (renaming a directory + * into a subdirectory of itself). 
+ */ + if (fdnode != tdnode) + tmpfs_dir_detach(fdnode, de); + + /* + * Handle any name change. Swap with newname, we will + * deallocate it at the end. + */ + if (newname != NULL) { +#if 0 + TMPFS_NODE_LOCK(fnode); + fnode->tn_status |= TMPFS_NODE_CHANGED; + TMPFS_NODE_UNLOCK(fnode); +#endif + oldname = de->td_name; + de->td_name = newname; + de->td_namelen = (uint16_t)tncp->nc_nlen; + newname = oldname; + } + + /* + * Link entry to target directory. If the entry + * represents a directory move the parent linkage + * as well. + */ + if (fdnode != tdnode) { + if (de->td_node->tn_type == VDIR) { + TMPFS_VALIDATE_DIR(fnode); + + TMPFS_NODE_LOCK(tdnode); + tdnode->tn_links++; + tdnode->tn_status |= TMPFS_NODE_MODIFIED; + TMPFS_NODE_UNLOCK(tdnode); + + TMPFS_NODE_LOCK(fnode); + fnode->tn_dir.tn_parent = tdnode; + fnode->tn_status |= TMPFS_NODE_CHANGED; + TMPFS_NODE_UNLOCK(fnode); + + TMPFS_NODE_LOCK(fdnode); + fdnode->tn_links--; + fdnode->tn_status |= TMPFS_NODE_MODIFIED; + TMPFS_NODE_UNLOCK(fdnode); + } + tmpfs_dir_attach(tdnode, de); + } else { + TMPFS_NODE_LOCK(tdnode); + tdnode->tn_status |= TMPFS_NODE_MODIFIED; + TMPFS_NODE_UNLOCK(tdnode); + } + + /* + * If we are overwriting an entry, we have to remove the old one + * from the target directory. + */ + if (tvp != NULL) { + /* Remove the old entry from the target directory. */ + de = tmpfs_dir_lookup(tdnode, tnode, tncp); + tmpfs_dir_detach(tdnode, de); + tmpfs_knote(tdnode->tn_vnode, NOTE_DELETE); + + /* + * Free the directory entry we just deleted. Note that the + * node referred by it will not be removed until the vnode is + * really reclaimed. + */ + tmpfs_free_dirent(VFS_TO_TMPFS(tvp->v_mount), de); + /*cache_inval_vp(tvp, CINV_DESTROY);*/ + } + + /* + * Finish up + */ + if (newname) { + kfree(newname, tmp->tm_name_zone); + newname = NULL; + } + cache_rename(v->a_fnch, v->a_tnch); + tmpfs_knote(v->a_fdvp, NOTE_WRITE); + tmpfs_knote(v->a_tdvp, NOTE_WRITE); + if (fnode->tn_vnode) + tmpfs_knote(fnode->tn_vnode, NOTE_RENAME); + error = 0; + +out_locked: + ; + +out: + if (tvp) + vrele(tvp); + + return error; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_nmkdir(struct vop_nmkdir_args *v) +{ + struct vnode *dvp = v->a_dvp; + struct vnode **vpp = v->a_vpp; + struct namecache *ncp = v->a_nch->ncp; + struct vattr *vap = v->a_vap; + struct ucred *cred = v->a_cred; + int error; + + KKASSERT(vap->va_type == VDIR); + + error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, NULL); + if (error == 0) { + cache_setunresolved(v->a_nch); + cache_setvp(v->a_nch, *vpp); + tmpfs_knote(dvp, NOTE_WRITE | NOTE_LINK); + } + + return error; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_nrmdir(struct vop_nrmdir_args *v) +{ + struct vnode *dvp = v->a_dvp; + struct namecache *ncp = v->a_nch->ncp; + struct vnode *vp; + struct tmpfs_dirent *de; + struct hammer2_mount *tmp; + struct tmpfs_node *dnode; + struct tmpfs_node *node; + int error; + + /* + * We have to acquire the vp from v->a_nch because we will likely + * unresolve the namecache entry, and a vrele/vput is needed to + * trigger the tmpfs_inactive/tmpfs_reclaim sequence. + * + * We have to use vget to clear any inactive state on the vnode, + * otherwise the vnode may remain inactive and thus tmpfs_inactive + * will not get called when we release it. 
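+	 * (Same vget-then-unlock sequence as in tmpfs_nremove() above.)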
+ */ + error = cache_vget(v->a_nch, v->a_cred, LK_SHARED, &vp); + KKASSERT(error == 0); + vn_unlock(vp); + + /* + * Prevalidate so we don't hit an assertion later + */ + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + + tmp = VFS_TO_TMPFS(dvp->v_mount); + dnode = VP_TO_TMPFS_DIR(dvp); + node = VP_TO_TMPFS_DIR(vp); + + /* Directories with more than two entries ('.' and '..') cannot be + * removed. */ + if (node->tn_size > 0) { + error = ENOTEMPTY; + goto out; + } + + if ((dnode->tn_flags & APPEND) + || (node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND))) { + error = EPERM; + goto out; + } + + /* This invariant holds only if we are not trying to remove "..". + * We checked for that above so this is safe now. */ + KKASSERT(node->tn_dir.tn_parent == dnode); + + /* Get the directory entry associated with node (vp). This was + * filled by tmpfs_lookup while looking up the entry. */ + de = tmpfs_dir_lookup(dnode, node, ncp); + KKASSERT(TMPFS_DIRENT_MATCHES(de, + ncp->nc_name, + ncp->nc_nlen)); + + /* Check flags to see if we are allowed to remove the directory. */ + if ((dnode->tn_flags & APPEND) || + node->tn_flags & (NOUNLINK | IMMUTABLE | APPEND)) { + error = EPERM; + goto out; + } + + + /* Detach the directory entry from the directory (dnode). */ + tmpfs_dir_detach(dnode, de); + + /* No vnode should be allocated for this entry from this point */ + TMPFS_NODE_LOCK(node); + TMPFS_ASSERT_ELOCKED(node); + TMPFS_NODE_LOCK(dnode); + TMPFS_ASSERT_ELOCKED(dnode); + +#if 0 + /* handled by tmpfs_free_node */ + KKASSERT(node->tn_links > 0); + node->tn_links--; + node->tn_dir.tn_parent = NULL; +#endif + node->tn_status |= TMPFS_NODE_ACCESSED | TMPFS_NODE_CHANGED | \ + TMPFS_NODE_MODIFIED; + +#if 0 + /* handled by tmpfs_free_node */ + KKASSERT(dnode->tn_links > 0); + dnode->tn_links--; +#endif + dnode->tn_status |= TMPFS_NODE_ACCESSED | \ + TMPFS_NODE_CHANGED | TMPFS_NODE_MODIFIED; + + TMPFS_NODE_UNLOCK(dnode); + TMPFS_NODE_UNLOCK(node); + + /* Free the directory entry we just deleted. Note that the node + * referred by it will not be removed until the vnode is really + * reclaimed. */ + tmpfs_free_dirent(tmp, de); + + /* Release the deleted vnode (will destroy the node, notify + * interested parties and clean it from the cache). 
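+	 * The actual vrele() happens at the common "out:" exit below.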
*/ + + TMPFS_NODE_LOCK(dnode); + dnode->tn_status |= TMPFS_NODE_CHANGED; + TMPFS_NODE_UNLOCK(dnode); + tmpfs_update(dvp); + + cache_setunresolved(v->a_nch); + cache_setvp(v->a_nch, NULL); + /*cache_inval_vp(vp, CINV_DESTROY);*/ + tmpfs_knote(dvp, NOTE_WRITE | NOTE_LINK); + error = 0; + +out: + vrele(vp); + + return error; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_nsymlink(struct vop_nsymlink_args *v) +{ + struct vnode *dvp = v->a_dvp; + struct vnode **vpp = v->a_vpp; + struct namecache *ncp = v->a_nch->ncp; + struct vattr *vap = v->a_vap; + struct ucred *cred = v->a_cred; + char *target = v->a_target; + int error; + + vap->va_type = VLNK; + error = tmpfs_alloc_file(dvp, vpp, vap, ncp, cred, target); + if (error == 0) { + tmpfs_knote(*vpp, NOTE_WRITE); + cache_setunresolved(v->a_nch); + cache_setvp(v->a_nch, *vpp); + } + + return error; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_readdir(struct vop_readdir_args *v) +{ + struct vnode *vp = v->a_vp; + struct uio *uio = v->a_uio; + int *eofflag = v->a_eofflag; + off_t **cookies = v->a_cookies; + int *ncookies = v->a_ncookies; + struct hammer2_mount *tmp; + int error; + off_t startoff; + off_t cnt = 0; + struct tmpfs_node *node; + + /* This operation only makes sense on directory nodes. */ + if (vp->v_type != VDIR) + return ENOTDIR; + + tmp = VFS_TO_TMPFS(vp->v_mount); + node = VP_TO_TMPFS_DIR(vp); + startoff = uio->uio_offset; + + if (uio->uio_offset == TMPFS_DIRCOOKIE_DOT) { + error = tmpfs_dir_getdotdent(node, uio); + if (error != 0) + goto outok; + cnt++; + } + + if (uio->uio_offset == TMPFS_DIRCOOKIE_DOTDOT) { + error = tmpfs_dir_getdotdotdent(tmp, node, uio); + if (error != 0) + goto outok; + cnt++; + } + + error = tmpfs_dir_getdents(node, uio, &cnt); + +outok: + KKASSERT(error >= -1); + + if (error == -1) + error = 0; + + if (eofflag != NULL) + *eofflag = + (error == 0 && uio->uio_offset == TMPFS_DIRCOOKIE_EOF); + + /* Update NFS-related variables. 
*/ + if (error == 0 && cookies != NULL && ncookies != NULL) { + off_t i; + off_t off = startoff; + struct tmpfs_dirent *de = NULL; + + *ncookies = cnt; + *cookies = kmalloc(cnt * sizeof(off_t), M_TEMP, M_WAITOK); + + for (i = 0; i < cnt; i++) { + KKASSERT(off != TMPFS_DIRCOOKIE_EOF); + if (off == TMPFS_DIRCOOKIE_DOT) { + off = TMPFS_DIRCOOKIE_DOTDOT; + } else { + if (off == TMPFS_DIRCOOKIE_DOTDOT) { + de = TAILQ_FIRST(&node->tn_dir.tn_dirhead); + } else if (de != NULL) { + de = TAILQ_NEXT(de, td_entries); + } else { + de = tmpfs_dir_lookupbycookie(node, + off); + KKASSERT(de != NULL); + de = TAILQ_NEXT(de, td_entries); + } + if (de == NULL) + off = TMPFS_DIRCOOKIE_EOF; + else + off = tmpfs_dircookie(de); + } + + (*cookies)[i] = off; + } + KKASSERT(uio->uio_offset == off); + } + + return error; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_readlink(struct vop_readlink_args *v) +{ + struct vnode *vp = v->a_vp; + struct uio *uio = v->a_uio; + + int error; + struct tmpfs_node *node; + + KKASSERT(uio->uio_offset == 0); + KKASSERT(vp->v_type == VLNK); + + node = VP_TO_TMPFS_NODE(vp); + + error = uiomove(node->tn_link, MIN(node->tn_size, uio->uio_resid), + uio); + TMPFS_NODE_LOCK(node); + node->tn_status |= TMPFS_NODE_ACCESSED; + TMPFS_NODE_UNLOCK(node); + + return error; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_inactive(struct vop_inactive_args *v) +{ + struct vnode *vp = v->a_vp; + struct tmpfs_node *node; + + node = VP_TO_TMPFS_NODE(vp); + + /* + * Degenerate case + */ + if (node == NULL) { + vrecycle(vp); + return(0); + } + + /* + * Get rid of unreferenced deleted vnodes sooner rather than + * later so the data memory can be recovered immediately. + * + * We must truncate the vnode to prevent the normal reclamation + * path from flushing the data for the removed file to disk. + */ + TMPFS_NODE_LOCK(node); + if ((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0 && + (node->tn_links == 0 || + (node->tn_links == 1 && node->tn_type == VDIR && + node->tn_dir.tn_parent))) + { + node->tn_vpstate = TMPFS_VNODE_DOOMED; + TMPFS_NODE_UNLOCK(node); + if (node->tn_type == VREG) + tmpfs_truncate(vp, 0); + vrecycle(vp); + } else { + TMPFS_NODE_UNLOCK(node); + } + + return 0; +} + +/* --------------------------------------------------------------------- */ + +int +tmpfs_reclaim(struct vop_reclaim_args *v) +{ + struct vnode *vp = v->a_vp; + struct hammer2_mount *tmp; + struct tmpfs_node *node; + + node = VP_TO_TMPFS_NODE(vp); + tmp = VFS_TO_TMPFS(vp->v_mount); + + tmpfs_free_vp(vp); + + /* + * If the node referenced by this vnode was deleted by the + * user, we must free its associated data structures now that + * the vnode is being reclaimed. + * + * Directories have an extra link ref. 
+ */ + TMPFS_NODE_LOCK(node); + if ((node->tn_vpstate & TMPFS_VNODE_ALLOCATING) == 0 && + (node->tn_links == 0 || + (node->tn_links == 1 && node->tn_type == VDIR && + node->tn_dir.tn_parent))) + { + node->tn_vpstate = TMPFS_VNODE_DOOMED; + tmpfs_free_node(tmp, node); + /* eats the lock */ + } else { + TMPFS_NODE_UNLOCK(node); + } + + KKASSERT(vp->v_data == NULL); + return 0; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_print(struct vop_print_args *v) +{ + struct vnode *vp = v->a_vp; + + struct tmpfs_node *node; + + node = VP_TO_TMPFS_NODE(vp); + + kprintf("tag VT_TMPFS, tmpfs_node %p, flags 0x%x, links %d\n", + node, node->tn_flags, node->tn_links); + kprintf("\tmode 0%o, owner %d, group %d, size %ju, status 0x%x\n", + node->tn_mode, node->tn_uid, node->tn_gid, + (uintmax_t)node->tn_size, node->tn_status); + + if (vp->v_type == VFIFO) + fifo_printinfo(vp); + + kprintf("\n"); + + return 0; +} + +/* --------------------------------------------------------------------- */ + +static int +tmpfs_pathconf(struct vop_pathconf_args *v) +{ + int name = v->a_name; + register_t *retval = v->a_retval; + + int error; + + error = 0; + + switch (name) { + case _PC_LINK_MAX: + *retval = LINK_MAX; + break; + + case _PC_NAME_MAX: + *retval = NAME_MAX; + break; + + case _PC_PATH_MAX: + *retval = PATH_MAX; + break; + + case _PC_PIPE_BUF: + *retval = PIPE_BUF; + break; + + case _PC_CHOWN_RESTRICTED: + *retval = 1; + break; + + case _PC_NO_TRUNC: + *retval = 1; + break; + + case _PC_SYNC_IO: + *retval = 1; + break; + + case _PC_FILESIZEBITS: + *retval = 0; /* XXX Don't know which value should I return. */ + break; + + default: + error = EINVAL; + } + + return error; +} + +/************************************************************************ + * KQFILTER OPS * + ************************************************************************/ + +static void filt_tmpfsdetach(struct knote *kn); +static int filt_tmpfsread(struct knote *kn, long hint); +static int filt_tmpfswrite(struct knote *kn, long hint); +static int filt_tmpfsvnode(struct knote *kn, long hint); + +static struct filterops tmpfsread_filtops = + { FILTEROP_ISFD, NULL, filt_tmpfsdetach, filt_tmpfsread }; +static struct filterops tmpfswrite_filtops = + { FILTEROP_ISFD, NULL, filt_tmpfsdetach, filt_tmpfswrite }; +static struct filterops tmpfsvnode_filtops = + { FILTEROP_ISFD, NULL, filt_tmpfsdetach, filt_tmpfsvnode }; + +static int +tmpfs_kqfilter (struct vop_kqfilter_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct knote *kn = ap->a_kn; + + switch (kn->kn_filter) { + case EVFILT_READ: + kn->kn_fop = &tmpfsread_filtops; + break; + case EVFILT_WRITE: + kn->kn_fop = &tmpfswrite_filtops; + break; + case EVFILT_VNODE: + kn->kn_fop = &tmpfsvnode_filtops; + break; + default: + return (EOPNOTSUPP); + } + + kn->kn_hook = (caddr_t)vp; + + knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); + + return(0); +} + +static void +filt_tmpfsdetach(struct knote *kn) +{ + struct vnode *vp = (void *)kn->kn_hook; + + knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn); +} + +static int +filt_tmpfsread(struct knote *kn, long hint) +{ + struct vnode *vp = (void *)kn->kn_hook; + struct tmpfs_node *node = VP_TO_TMPFS_NODE(vp); + off_t off; + + if (hint == NOTE_REVOKE) { + kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); + return(1); + } + + /* + * Interlock against MP races when performing this function. 
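+	 * The per-mount token keeps writers from changing tn_size while
+	 * we compute how much readable data remains.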
+ */ + lwkt_gettoken(&vp->v_mount->mnt_token); + off = node->tn_size - kn->kn_fp->f_offset; + kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; + if (kn->kn_sfflags & NOTE_OLDAPI) { + lwkt_reltoken(&vp->v_mount->mnt_token); + return(1); + } + + if (kn->kn_data == 0) { + kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX; + } + lwkt_reltoken(&vp->v_mount->mnt_token); + return (kn->kn_data != 0); +} + +static int +filt_tmpfswrite(struct knote *kn, long hint) +{ + if (hint == NOTE_REVOKE) + kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT); + kn->kn_data = 0; + return (1); +} + +static int +filt_tmpfsvnode(struct knote *kn, long hint) +{ + if (kn->kn_sfflags & hint) + kn->kn_fflags |= hint; + if (hint == NOTE_REVOKE) { + kn->kn_flags |= (EV_EOF | EV_NODATA); + return (1); + } + return (kn->kn_fflags != 0); +} + + +/* --------------------------------------------------------------------- */ + +/* + * vnode operations vector used for files stored in a tmpfs file system. + */ +struct vop_ops tmpfs_vnode_vops = { + .vop_default = vop_defaultop, + .vop_getpages = vop_stdgetpages, + .vop_putpages = vop_stdputpages, + .vop_ncreate = tmpfs_ncreate, + .vop_nresolve = tmpfs_nresolve, + .vop_nlookupdotdot = tmpfs_nlookupdotdot, + .vop_nmknod = tmpfs_nmknod, + .vop_open = tmpfs_open, + .vop_close = tmpfs_close, + .vop_access = tmpfs_access, + .vop_getattr = tmpfs_getattr, + .vop_setattr = tmpfs_setattr, + .vop_read = tmpfs_read, + .vop_write = tmpfs_write, + .vop_fsync = tmpfs_fsync, + .vop_nremove = tmpfs_nremove, + .vop_nlink = tmpfs_nlink, + .vop_nrename = tmpfs_nrename, + .vop_nmkdir = tmpfs_nmkdir, + .vop_nrmdir = tmpfs_nrmdir, + .vop_nsymlink = tmpfs_nsymlink, + .vop_readdir = tmpfs_readdir, + .vop_readlink = tmpfs_readlink, + .vop_inactive = tmpfs_inactive, + .vop_reclaim = tmpfs_reclaim, + .vop_print = tmpfs_print, + .vop_pathconf = tmpfs_pathconf, + .vop_bmap = tmpfs_bmap, + .vop_strategy = tmpfs_strategy, + .vop_advlock = tmpfs_advlock, + .vop_kqfilter = tmpfs_kqfilter +}; +/* $NetBSD: tmpfs_fifoops.c,v 1.5 2005/12/11 12:24:29 christos Exp $ */ + +/*- + * Copyright (c) 2005 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Julio M. Merino Vidal, developed as part of Google's Summer of Code + * 2005 program. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
+struct vop_ops tmpfs_vnode_vops = {
+	.vop_default =		vop_defaultop,
+	.vop_getpages =		vop_stdgetpages,
+	.vop_putpages =		vop_stdputpages,
+	.vop_ncreate =		tmpfs_ncreate,
+	.vop_nresolve =		tmpfs_nresolve,
+	.vop_nlookupdotdot =	tmpfs_nlookupdotdot,
+	.vop_nmknod =		tmpfs_nmknod,
+	.vop_open =		tmpfs_open,
+	.vop_close =		tmpfs_close,
+	.vop_access =		tmpfs_access,
+	.vop_getattr =		tmpfs_getattr,
+	.vop_setattr =		tmpfs_setattr,
+	.vop_read =		tmpfs_read,
+	.vop_write =		tmpfs_write,
+	.vop_fsync =		tmpfs_fsync,
+	.vop_nremove =		tmpfs_nremove,
+	.vop_nlink =		tmpfs_nlink,
+	.vop_nrename =		tmpfs_nrename,
+	.vop_nmkdir =		tmpfs_nmkdir,
+	.vop_nrmdir =		tmpfs_nrmdir,
+	.vop_nsymlink =		tmpfs_nsymlink,
+	.vop_readdir =		tmpfs_readdir,
+	.vop_readlink =		tmpfs_readlink,
+	.vop_inactive =		tmpfs_inactive,
+	.vop_reclaim =		tmpfs_reclaim,
+	.vop_print =		tmpfs_print,
+	.vop_pathconf =		tmpfs_pathconf,
+	.vop_bmap =		tmpfs_bmap,
+	.vop_strategy =		tmpfs_strategy,
+	.vop_advlock =		tmpfs_advlock,
+	.vop_kqfilter =		tmpfs_kqfilter
+};
+/* $NetBSD: tmpfs_fifoops.c,v 1.5 2005/12/11 12:24:29 christos Exp $ */
+
+/*-
+ * Copyright (c) 2005 The NetBSD Foundation, Inc.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Julio M. Merino Vidal, developed as part of Google's Summer of Code
+ * 2005 program.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * tmpfs vnode interface for named pipes.
+ */
+
+#include <sys/kernel.h>
+#include <sys/param.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+
+#include <vfs/fifofs/fifo.h>
+#include "hammer2.h"
+
+/* --------------------------------------------------------------------- */
+
+static int
+tmpfs_fifo_kqfilter(struct vop_kqfilter_args *ap)
+{
+	struct vnode *vp;
+	struct tmpfs_node *node;
+
+	vp = ap->a_vp;
+	node = VP_TO_TMPFS_NODE(vp);
+
+	switch (ap->a_kn->kn_filter) {
+	case EVFILT_READ:
+		node->tn_status |= TMPFS_NODE_ACCESSED;
+		break;
+	case EVFILT_WRITE:
+		node->tn_status |= TMPFS_NODE_MODIFIED;
+		break;
+	}
+
+	return fifo_vnode_vops.vop_kqfilter(ap);
+}
+
+/* --------------------------------------------------------------------- */
+
+static int
+tmpfs_fifo_close(struct vop_close_args *v)
+{
+	struct tmpfs_node *node;
+
+	node = VP_TO_TMPFS_NODE(v->a_vp);
+	node->tn_status |= TMPFS_NODE_ACCESSED;
+
+	tmpfs_update(v->a_vp);
+	return fifo_vnode_vops.vop_close(v);
+}
+
+/*
+ * vnode operations vector used for fifos stored in a tmpfs file system.
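+ *
+ * Only the ops that need tmpfs-side bookkeeping (status bits,
+ * attributes, node reclamation) are overridden; everything else,
+ * including the actual read/write path, falls through to the
+ * kernel's generic fifo implementation via
+ * .vop_default = fifo_vnoperate.
+ */
+struct vop_ops tmpfs_fifo_vops = {
+	.vop_default =		fifo_vnoperate,
+	.vop_close =		tmpfs_fifo_close,
+	.vop_reclaim =		tmpfs_reclaim,
+	.vop_access =		tmpfs_access,
+	.vop_getattr =		tmpfs_getattr,
+	.vop_setattr =		tmpfs_setattr,
+	.vop_kqfilter =		tmpfs_fifo_kqfilter,
+};
-- 
2.41.0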