hammer2 - Major restructuring, part 5/several
[dragonfly.git] / sys / vfs / hammer2 / hammer2_vnops.c
1 /*
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * Kernel Filesystem interface
37  *
38  * NOTE! local ipdata pointers must be reloaded on any modifying operation
39  *       to the inode as its underlying chain may have changed.
40  */
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/fcntl.h>
46 #include <sys/buf.h>
47 #include <sys/proc.h>
48 #include <sys/namei.h>
49 #include <sys/mount.h>
50 #include <sys/vnode.h>
51 #include <sys/mountctl.h>
52 #include <sys/dirent.h>
53 #include <sys/uio.h>
54
55 #include "hammer2.h"
56
/*
 * NOTE(review): ZFOFFSET is not referenced in the portion of the file
 * reviewed here; presumably a sentinel offset value -- confirm against
 * its users before changing it.
 */
#define ZFOFFSET        (-2LL)

/*
 * Forward declarations for the file-local helpers used by the VOP entry
 * points below.
 *
 * hammer2_read_file() and hammer2_write_file() perform the uio-driven
 * data transfer for the read/readlink and write VOPs respectively.
 */
static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
                                int seqcount);
static int hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
                                struct uio *uio, int ioflag, int seqcount);
/*
 * NOTE(review): judging by its signature this returns a physical offset
 * for the logical block at lbase and reports failure through *errorp --
 * confirm against the definition.
 */
static hammer2_off_t hammer2_assign_physical(hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                hammer2_key_t lbase, int lblksize,
                                int *errorp);
/*
 * File resize helpers; both take the new size in bytes (nsize) and are
 * called with the inode exclusively locked by the setattr path.
 */
static void hammer2_extend_file(hammer2_trans_t *trans,
                                hammer2_inode_t *ip, hammer2_key_t nsize);
static void hammer2_truncate_file(hammer2_trans_t *trans,
                                hammer2_inode_t *ip, hammer2_key_t nsize);
71
72 static __inline
73 void
74 hammer2_knote(struct vnode *vp, int flags)
75 {
76         if (flags)
77                 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
78 }
79
80 /*
81  * Last reference to a vnode is going away but it is still cached.
82  */
83 static
84 int
85 hammer2_vop_inactive(struct vop_inactive_args *ap)
86 {
87         hammer2_inode_t *ip;
88         hammer2_trans_t trans;
89         struct vnode *vp;
90 #if 0
91         struct hammer2_mount *hmp;
92 #endif
93
94         vp = ap->a_vp;
95         ip = VTOI(vp);
96
97         /*
98          * Degenerate case
99          */
100         if (ip == NULL) {
101                 vrecycle(vp);
102                 return (0);
103         }
104
105         /*
106          * Detect updates to the embedded data which may be synchronized by
107          * the strategy code.  Simply mark the inode modified so it gets
108          * picked up by our normal flush.
109          */
110         hammer2_inode_lock_ex(ip);
111         KKASSERT(ip->chain);
112         if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
113                 atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
114                 hammer2_trans_init(&trans, ip->hmp);
115                 hammer2_chain_modify(&trans, ip->chain, 0);
116                 hammer2_trans_done(&trans);
117         }
118
119         /*
120          * Check for deleted inodes and recycle immediately.
121          */
122         if (ip->chain->flags & HAMMER2_CHAIN_DELETED) {
123                 hammer2_inode_unlock_ex(ip);
124                 vrecycle(vp);
125         } else {
126                 hammer2_inode_unlock_ex(ip);
127         }
128         return (0);
129 }
130
131 /*
132  * Reclaim a vnode so that it can be reused; after the inode is
133  * disassociated, the filesystem must manage it alone.
134  */
135 static
136 int
137 hammer2_vop_reclaim(struct vop_reclaim_args *ap)
138 {
139         hammer2_chain_t *chain;
140         hammer2_inode_t *ip;
141         hammer2_mount_t *hmp;
142         hammer2_trans_t trans;
143         struct vnode *vp;
144
145         vp = ap->a_vp;
146         ip = VTOI(vp);
147         if (ip == NULL)
148                 return(0);
149         hmp = ip->hmp;
150
151         /*
152          * Set SUBMODIFIED so we can detect and propagate the DESTROYED
153          * bit in the flush code.
154          *
155          * ip->chain might be stale, correct it before checking as older
156          * versions of the chain are likely marked deleted even if the
157          * file hasn't been.  XXX ip->chain should never be stale on
158          * reclaim.
159          */
160         hammer2_inode_lock_ex(ip);
161         chain = ip->chain;
162         if (chain->duplink)
163                 kprintf("RECLAIM DUPLINKED IP: %p %p\n", ip, ip->chain);
164 #if 0
165         while (chain->duplink)
166                 chain = chain->duplink;
167         if (ip->chain != chain) {
168                 hammer2_inode_repoint(ip, ip->pip, chain);
169                 chain = ip->chain;
170         }
171 #endif
172
173         /*
174          * The final close of a deleted file or directory marks it for
175          * destruction.  The DESTROYED flag allows the flusher to shortcut
176          * any modified blocks still unflushed (that is, just ignore them).
177          *
178          * HAMMER2 usually does not try to optimize the freemap by returning
179          * deleted blocks to it as it does not usually know how many snapshots
180          * might be referencing portions of the file/dir.  XXX TODO.
181          *
182          * XXX TODO - However, any modified file as-of when a snapshot is made
183          *            cannot use this optimization as some of the modifications
184          *            may wind up being part of the snapshot.
185          */
186         vp->v_data = NULL;
187         ip->vp = NULL;
188         if (chain->flags & HAMMER2_CHAIN_DELETED) {
189                 KKASSERT(chain->flags & HAMMER2_CHAIN_DELETED);
190                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROYED |
191                                               HAMMER2_CHAIN_SUBMODIFIED);
192         }
193         if (chain->flags & (HAMMER2_CHAIN_MODIFIED |
194                             HAMMER2_CHAIN_DELETED |
195                             HAMMER2_CHAIN_SUBMODIFIED)) {
196                 hammer2_trans_init(&trans, ip->hmp);
197                 hammer2_chain_flush(&trans, chain);
198                 hammer2_trans_done(&trans);
199         }
200         if (ip->refs > 2)                           /* (our lock + vp ref) */
201                 hammer2_inode_unlock_ex(ip);        /* unlock */
202         else
203                 hammer2_inode_put(ip);              /* unlock & disconnect */
204         /* chain no longer referenced */
205         /* chain = NULL; not needed */
206         hammer2_inode_drop(ip);                     /* vp ref */
207
208         /*
209          * XXX handle background sync when ip dirty, kernel will no longer
210          * notify us regarding this inode because there is no longer a
211          * vnode attached to it.
212          */
213
214         return (0);
215 }
216
217 static
218 int
219 hammer2_vop_fsync(struct vop_fsync_args *ap)
220 {
221         hammer2_inode_t *ip;
222         hammer2_trans_t trans;
223         struct vnode *vp;
224
225         vp = ap->a_vp;
226         ip = VTOI(vp);
227
228         hammer2_trans_init(&trans, ip->hmp);
229         hammer2_inode_lock_ex(ip);
230
231         vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
232
233         /*
234          * Detect updates to the embedded data which may be synchronized by
235          * the strategy code.  Simply mark the inode modified so it gets
236          * picked up by our normal flush.
237          */
238         if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
239                 atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
240                 atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
241                 hammer2_chain_modify(&trans, ip->chain, 0);
242         }
243
244         /*
245          * Calling chain_flush here creates a lot of duplicative
246          * COW operations due to non-optimal vnode ordering.
247          *
248          * Only do it for an actual fsync() syscall.  The other forms
249          * which call this function will eventually call chain_flush
250          * on the volume root as a catch-all, which is far more optimal.
251          */
252         if (ap->a_flags & VOP_FSYNC_SYSCALL) {
253                 atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
254                 hammer2_chain_flush(&trans, ip->chain);
255         }
256         hammer2_inode_unlock_ex(ip);
257         hammer2_trans_done(&trans);
258         return (0);
259 }
260
261 static
262 int
263 hammer2_vop_access(struct vop_access_args *ap)
264 {
265         hammer2_inode_t *ip = VTOI(ap->a_vp);
266         hammer2_inode_data_t *ipdata;
267         uid_t uid;
268         gid_t gid;
269         int error;
270
271         hammer2_inode_lock_sh(ip);
272         ipdata = &ip->chain->data->ipdata;
273         uid = hammer2_to_unix_xid(&ipdata->uid);
274         gid = hammer2_to_unix_xid(&ipdata->gid);
275         error = vop_helper_access(ap, uid, gid, ipdata->mode, ipdata->uflags);
276         hammer2_inode_unlock_sh(ip);
277
278         return (error);
279 }
280
281 static
282 int
283 hammer2_vop_getattr(struct vop_getattr_args *ap)
284 {
285         hammer2_inode_data_t *ipdata;
286         hammer2_pfsmount_t *pmp;
287         hammer2_inode_t *ip;
288         struct vnode *vp;
289         struct vattr *vap;
290
291         vp = ap->a_vp;
292         vap = ap->a_vap;
293
294         ip = VTOI(vp);
295         pmp = ip->pmp;
296
297         hammer2_inode_lock_sh(ip);
298         ipdata = &ip->chain->data->ipdata;
299
300         vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
301         vap->va_fileid = ipdata->inum;
302         vap->va_mode = ipdata->mode;
303         vap->va_nlink = ipdata->nlinks;
304         vap->va_uid = hammer2_to_unix_xid(&ipdata->uid);
305         vap->va_gid = hammer2_to_unix_xid(&ipdata->gid);
306         vap->va_rmajor = 0;
307         vap->va_rminor = 0;
308         vap->va_size = ipdata->size;
309         vap->va_blocksize = HAMMER2_PBUFSIZE;
310         vap->va_flags = ipdata->uflags;
311         hammer2_time_to_timespec(ipdata->ctime, &vap->va_ctime);
312         hammer2_time_to_timespec(ipdata->mtime, &vap->va_mtime);
313         hammer2_time_to_timespec(ipdata->mtime, &vap->va_atime);
314         vap->va_gen = 1;
315         vap->va_bytes = vap->va_size;   /* XXX */
316         vap->va_type = hammer2_get_vtype(ip->chain);
317         vap->va_filerev = 0;
318         vap->va_uid_uuid = ipdata->uid;
319         vap->va_gid_uuid = ipdata->gid;
320         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
321                           VA_FSID_UUID_VALID;
322
323         hammer2_inode_unlock_sh(ip);
324
325         return (0);
326 }
327
328 static
329 int
330 hammer2_vop_setattr(struct vop_setattr_args *ap)
331 {
332         hammer2_inode_data_t *ipdata;
333         hammer2_inode_t *ip;
334         hammer2_mount_t *hmp;
335         hammer2_trans_t trans;
336         struct vnode *vp;
337         struct vattr *vap;
338         int error;
339         int kflags = 0;
340         int domtime = 0;
341         uint64_t ctime;
342
343         vp = ap->a_vp;
344         vap = ap->a_vap;
345         hammer2_update_time(&ctime);
346
347         ip = VTOI(vp);
348         hmp = ip->hmp;
349
350         if (hmp->ronly)
351                 return(EROFS);
352
353         hammer2_trans_init(&trans, hmp);
354         hammer2_inode_lock_ex(ip);
355         ipdata = &ip->chain->data->ipdata;
356         error = 0;
357
358         if (vap->va_flags != VNOVAL) {
359                 u_int32_t flags;
360
361                 flags = ipdata->uflags;
362                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
363                                          hammer2_to_unix_xid(&ipdata->uid),
364                                          ap->a_cred);
365                 if (error == 0) {
366                         if (ipdata->uflags != flags) {
367                                 hammer2_chain_modify(&trans, ip->chain, 0);
368                                 ipdata = &ip->chain->data->ipdata; /* RELOAD */
369                                 ipdata->uflags = flags;
370                                 ipdata->ctime = ctime;
371                                 kflags |= NOTE_ATTRIB;
372                         }
373                         if (ipdata->uflags & (IMMUTABLE | APPEND)) {
374                                 error = 0;
375                                 goto done;
376                         }
377                 }
378                 goto done;
379         }
380         if (ipdata->uflags & (IMMUTABLE | APPEND)) {
381                 error = EPERM;
382                 goto done;
383         }
384         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
385                 mode_t cur_mode = ipdata->mode;
386                 uid_t cur_uid = hammer2_to_unix_xid(&ipdata->uid);
387                 gid_t cur_gid = hammer2_to_unix_xid(&ipdata->gid);
388                 uuid_t uuid_uid;
389                 uuid_t uuid_gid;
390
391                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
392                                          ap->a_cred,
393                                          &cur_uid, &cur_gid, &cur_mode);
394                 if (error == 0) {
395                         hammer2_guid_to_uuid(&uuid_uid, cur_uid);
396                         hammer2_guid_to_uuid(&uuid_gid, cur_gid);
397                         if (bcmp(&uuid_uid, &ipdata->uid, sizeof(uuid_uid)) ||
398                             bcmp(&uuid_gid, &ipdata->gid, sizeof(uuid_gid)) ||
399                             ipdata->mode != cur_mode
400                         ) {
401                                 hammer2_chain_modify(&trans, ip->chain, 0);
402                                 ipdata = &ip->chain->data->ipdata; /* RELOAD */
403                                 ipdata->uid = uuid_uid;
404                                 ipdata->gid = uuid_gid;
405                                 ipdata->mode = cur_mode;
406                                 ipdata->ctime = ctime;
407                         }
408                         kflags |= NOTE_ATTRIB;
409                 }
410         }
411
412         /*
413          * Resize the file
414          */
415         if (vap->va_size != VNOVAL && ipdata->size != vap->va_size) {
416                 switch(vp->v_type) {
417                 case VREG:
418                         if (vap->va_size == ipdata->size)
419                                 break;
420                         if (vap->va_size < ipdata->size) {
421                                 hammer2_truncate_file(&trans, ip, vap->va_size);
422                         } else {
423                                 hammer2_extend_file(&trans, ip, vap->va_size);
424                         }
425                         ipdata = &ip->chain->data->ipdata; /* RELOAD */
426                         domtime = 1;
427                         break;
428                 default:
429                         error = EINVAL;
430                         goto done;
431                 }
432         }
433 #if 0
434         /* atime not supported */
435         if (vap->va_atime.tv_sec != VNOVAL) {
436                 hammer2_chain_modify(&trans, ip->chain, 0);
437                 ipdata = &ip->chain->data->ipdata; /* RELOAD */
438                 ipdata->atime = hammer2_timespec_to_time(&vap->va_atime);
439                 kflags |= NOTE_ATTRIB;
440         }
441 #endif
442         if (vap->va_mtime.tv_sec != VNOVAL) {
443                 hammer2_chain_modify(&trans, ip->chain, 0);
444                 ipdata = &ip->chain->data->ipdata; /* RELOAD */
445                 ipdata->mtime = hammer2_timespec_to_time(&vap->va_mtime);
446                 kflags |= NOTE_ATTRIB;
447         }
448         if (vap->va_mode != (mode_t)VNOVAL) {
449                 mode_t cur_mode = ipdata->mode;
450                 uid_t cur_uid = hammer2_to_unix_xid(&ipdata->uid);
451                 gid_t cur_gid = hammer2_to_unix_xid(&ipdata->gid);
452
453                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
454                                          cur_uid, cur_gid, &cur_mode);
455                 if (error == 0 && ipdata->mode != cur_mode) {
456                         hammer2_chain_modify(&trans, ip->chain, 0);
457                         ipdata = &ip->chain->data->ipdata; /* RELOAD */
458                         ipdata->mode = cur_mode;
459                         ipdata->ctime = ctime;
460                         kflags |= NOTE_ATTRIB;
461                 }
462         }
463 done:
464         hammer2_inode_unlock_ex(ip);
465         hammer2_trans_done(&trans);
466         return (error);
467 }
468
/*
 * Read directory entries into the caller's uio, starting at uio_offset.
 *
 * Offsets 0 and 1 are artificial entries for "." and "..".  All other
 * entries are found by scanning the directory inode's chain by key,
 * starting at (offset | HAMMER2_DIRHASH_VISIBLE).
 *
 * On return uio_offset holds the offset at which the next call should
 * resume, *a_eofflag (if requested) is set when the scan ran out of
 * chains, and the optional cookie array holds one offset per entry
 * written.
 *
 * Returns the error reported by vop_write_dirent() through 'error',
 * or 0.
 */
static
int
hammer2_vop_readdir(struct vop_readdir_args *ap)
{
        hammer2_inode_data_t *ipdata;
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;
        hammer2_inode_t *xip;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        hammer2_tid_t inum;
        hammer2_key_t lkey;
        struct uio *uio;
        off_t *cookies;
        off_t saveoff;
        int cookie_index;
        int ncookies;
        int error;
        int dtype;
        int r;

        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;          /* assigned but not otherwise used here */
        uio = ap->a_uio;
        saveoff = uio->uio_offset;

        /*
         * Set up directory entry cookies if requested.  The array is
         * sized from the residual (one cookie per 16 bytes, plus one)
         * and capped at 1024 entries.  ncookies is -1 when no cookies
         * were requested so that the (cookie_index == ncookies) tests
         * below never trigger.
         */
        if (ap->a_ncookies) {
                ncookies = uio->uio_resid / 16 + 1;
                if (ncookies > 1024)
                        ncookies = 1024;
                cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
        } else {
                ncookies = -1;
                cookies = NULL;
        }
        cookie_index = 0;

        hammer2_inode_lock_sh(ip);
        ipdata = &ip->chain->data->ipdata;

        /*
         * Handle artificial entries.  To ensure that only positive 64 bit
         * quantities are returned to userland we always strip off bit 63.
         * The hash code is designed such that codes 0x0000-0x7FFF are not
         * used, allowing us to use these codes for artificial entries.
         *
         * Entry 0 is used for '.' and entry 1 is used for '..'.  Do not
         * allow '..' to cross the mount point into (e.g.) the super-root.
         */
        error = 0;
        chain = (void *)(intptr_t)-1;   /* non-NULL for early goto done case */

        if (saveoff == 0) {
                inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 1, ".");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        if (saveoff == 1) {
                /*
                 * Be careful with lockorder when accessing ".."
                 *
                 * (ip is the current dir. xip is the parent dir).
                 *
                 * inum defaults to the directory's own inode number and
                 * is only replaced when a parent exists and ip is not
                 * the PFS root.  To lock parent-before-child, ip is
                 * unlocked, xip is locked, and ip is relocked; the loop
                 * retries if ip->pip changed while ip was unlocked.
                 * xip is referenced across the window in which neither
                 * lock is held.
                 */
                inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
                while (ip->pip != NULL && ip != ip->pmp->iroot) {
                        xip = ip->pip;
                        hammer2_inode_ref(xip);
                        hammer2_inode_unlock_sh(ip);
                        hammer2_inode_lock_sh(xip);
                        hammer2_inode_lock_sh(ip);
                        hammer2_inode_drop(xip);
                        if (xip == ip->pip) {
                                inum = xip->chain->data->ipdata.inum &
                                       HAMMER2_DIRHASH_USERMSK;
                                hammer2_inode_unlock_sh(xip);
                                break;
                        }
                        hammer2_inode_unlock_sh(xip);
                }
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 2, "..");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        /* Starting key for the real directory entries. */
        lkey = saveoff | HAMMER2_DIRHASH_VISIBLE;

        /*
         * parent is the inode chain, already locked for us.  Don't
         * double lock shared locks as this will screw up upgrades.
         */
        if (error) {
                goto done;
        }
        parent = hammer2_chain_lookup_init(ip->chain, HAMMER2_LOOKUP_SHARED);

        /*
         * Try an exact match on lkey first, then fall back to the first
         * chain in the range [lkey, max key].
         */
        chain = hammer2_chain_lookup(&parent, lkey, lkey,
                                     HAMMER2_LOOKUP_SHARED);
        if (chain == NULL) {
                chain = hammer2_chain_lookup(&parent,
                                             lkey, (hammer2_key_t)-1,
                                             HAMMER2_LOOKUP_SHARED);
        }
        while (chain) {
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        dtype = hammer2_get_dtype(chain);
                        saveoff = chain->bref.key & HAMMER2_DIRHASH_USERMSK;
                        r = vop_write_dirent(&error, uio,
                                             chain->data->ipdata.inum &
                                              HAMMER2_DIRHASH_USERMSK,
                                             dtype,
                                             chain->data->ipdata.name_len,
                                             chain->data->ipdata.filename);
                        /*
                         * Entry was not written; leave saveoff at this
                         * entry's key so the next call resumes here.
                         */
                        if (r)
                                break;
                        if (cookies)
                                cookies[cookie_index] = saveoff;
                        ++cookie_index;
                } else {
                        /* XXX chain error */
                        kprintf("bad chain type readdir %d\n",
                                chain->bref.type);
                }

                /*
                 * Keys may not be returned in order so once we have a
                 * placemarker (chain) the scan must allow the full range
                 * or some entries will be missed.
                 */
                chain = hammer2_chain_next(&parent, chain,
                                           HAMMER2_DIRHASH_VISIBLE,
                                           (hammer2_key_t)-1,
                                           HAMMER2_LOOKUP_SHARED);
                /*
                 * NOTE(review): when the loop exits below because the
                 * cookie array is full, saveoff has been set to one past
                 * the key of the NEXT chain, which has not been emitted
                 * yet -- confirm that a resumed scan does not skip it.
                 */
                if (chain) {
                        saveoff = (chain->bref.key &
                                   HAMMER2_DIRHASH_USERMSK) + 1;
                } else {
                        saveoff = (hammer2_key_t)-1;
                }
                if (cookie_index == ncookies)
                        break;
        }
        if (chain)
                hammer2_chain_unlock(chain);
        hammer2_chain_lookup_done(parent);
done:
        hammer2_inode_unlock_sh(ip);

        /*
         * EOF is reported only when the scan ran out of chains.  On the
         * early-exit paths chain still holds its non-NULL sentinel value.
         */
        if (ap->a_eofflag)
                *ap->a_eofflag = (chain == NULL);
        uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;

        /*
         * On an error with nothing produced the cookie array is freed
         * and zero cookies are reported; otherwise ownership of the
         * array passes to the caller.
         */
        if (error && cookie_index == 0) {
                if (cookies) {
                        kfree(cookies, M_TEMP);
                        *ap->a_ncookies = 0;
                        *ap->a_cookies = NULL;
                }
        } else {
                if (cookies) {
                        *ap->a_ncookies = cookie_index;
                        *ap->a_cookies = cookies;
                }
        }
        return (error);
}
648
649 /*
650  * hammer2_vop_readlink { vp, uio, cred }
651  */
652 static
653 int
654 hammer2_vop_readlink(struct vop_readlink_args *ap)
655 {
656         struct vnode *vp;
657         hammer2_mount_t *hmp;
658         hammer2_inode_t *ip;
659         int error;
660
661         vp = ap->a_vp;
662         if (vp->v_type != VLNK)
663                 return (EINVAL);
664         ip = VTOI(vp);
665         hmp = ip->hmp;
666
667         error = hammer2_read_file(ip, ap->a_uio, 0);
668         return (error);
669 }
670
671 static
672 int
673 hammer2_vop_read(struct vop_read_args *ap)
674 {
675         struct vnode *vp;
676         hammer2_mount_t *hmp;
677         hammer2_inode_t *ip;
678         struct uio *uio;
679         int error;
680         int seqcount;
681         int bigread;
682
683         /*
684          * Read operations supported on this vnode?
685          */
686         vp = ap->a_vp;
687         if (vp->v_type != VREG)
688                 return (EINVAL);
689
690         /*
691          * Misc
692          */
693         ip = VTOI(vp);
694         hmp = ip->hmp;
695         uio = ap->a_uio;
696         error = 0;
697
698         seqcount = ap->a_ioflag >> 16;
699         bigread = (uio->uio_resid > 100 * 1024 * 1024);
700
701         error = hammer2_read_file(ip, uio, seqcount);
702         return (error);
703 }
704
705 static
706 int
707 hammer2_vop_write(struct vop_write_args *ap)
708 {
709         hammer2_mount_t *hmp;
710         hammer2_inode_t *ip;
711         hammer2_trans_t trans;
712         thread_t td;
713         struct vnode *vp;
714         struct uio *uio;
715         int error;
716         int seqcount;
717         int bigwrite;
718
719         /*
720          * Read operations supported on this vnode?
721          */
722         vp = ap->a_vp;
723         if (vp->v_type != VREG)
724                 return (EINVAL);
725
726         /*
727          * Misc
728          */
729         ip = VTOI(vp);
730         hmp = ip->hmp;
731         uio = ap->a_uio;
732         error = 0;
733         if (hmp->ronly)
734                 return (EROFS);
735
736         seqcount = ap->a_ioflag >> 16;
737         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
738
739         /*
740          * Check resource limit
741          */
742         if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
743             uio->uio_offset + uio->uio_resid >
744              td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
745                 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
746                 return (EFBIG);
747         }
748
749         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
750
751         /*
752          * ip must be locked if extending the file.
753          * ip must be locked to avoid racing a truncation.
754          *
755          * ip must be marked modified, particularly because the write
756          * might wind up being copied into the embedded data area.
757          */
758         hammer2_inode_lock_ex(ip);
759         hammer2_trans_init(&trans, ip->hmp);
760         error = hammer2_write_file(ip, &trans, uio, ap->a_ioflag, seqcount);
761         hammer2_inode_unlock_ex(ip);
762         hammer2_trans_done(&trans);
763
764         return (error);
765 }
766
767 /*
768  * Perform read operations on a file or symlink given an UNLOCKED
769  * inode and uio.
770  *
771  * The passed ip is not locked.
772  */
773 static
774 int
775 hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
776 {
777         hammer2_off_t size;
778         struct buf *bp;
779         int error;
780
781         error = 0;
782
783         /*
784          * UIO read loop.
785          */
786         hammer2_inode_lock_sh(ip);
787         size = ip->chain->data->ipdata.size;
788
789         while (uio->uio_resid > 0 && uio->uio_offset < size) {
790                 hammer2_key_t lbase;
791                 hammer2_key_t leof;
792                 int lblksize;
793                 int loff;
794                 int n;
795
796                 lblksize = hammer2_calc_logical(ip, uio->uio_offset,
797                                                 &lbase, &leof);
798
799                 error = cluster_read(ip->vp, leof, lbase, lblksize,
800                                      uio->uio_resid, seqcount * BKVASIZE,
801                                      &bp);
802
803                 if (error)
804                         break;
805                 loff = (int)(uio->uio_offset - lbase);
806                 n = lblksize - loff;
807                 if (n > uio->uio_resid)
808                         n = uio->uio_resid;
809                 if (n > size - uio->uio_offset)
810                         n = (int)(size - uio->uio_offset);
811                 bp->b_flags |= B_AGE;
812                 uiomove((char *)bp->b_data + loff, n, uio);
813                 bqrelse(bp);
814         }
815         hammer2_inode_unlock_sh(ip);
816         return (error);
817 }
818
819 /*
820  * Called with a locked (ip) to do the underlying write to a file or
821  * to build the symlink target.
822  */
823 static
824 int
825 hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
826                    struct uio *uio, int ioflag, int seqcount)
827 {
828         hammer2_inode_data_t *ipdata;
829         hammer2_key_t old_eof;
830         struct buf *bp;
831         int kflags;
832         int error;
833         int modified = 0;
834
835         /*
836          * Setup if append
837          */
838         ipdata = &ip->chain->data->ipdata;
839         if (ioflag & IO_APPEND)
840                 uio->uio_offset = ipdata->size;
841         kflags = 0;
842         error = 0;
843
844         /*
845          * Extend the file if necessary.  If the write fails at some point
846          * we will truncate it back down to cover as much as we were able
847          * to write.
848          *
849          * Doing this now makes it easier to calculate buffer sizes in
850          * the loop.
851          */
852         KKASSERT(ipdata->type != HAMMER2_OBJTYPE_HARDLINK);
853         old_eof = ipdata->size;
854         if (uio->uio_offset + uio->uio_resid > ipdata->size) {
855                 modified = 1;
856                 hammer2_extend_file(trans, ip,
857                                     uio->uio_offset + uio->uio_resid);
858                 ipdata = &ip->chain->data->ipdata;      /* RELOAD */
859                 kflags |= NOTE_EXTEND;
860         }
861         KKASSERT(ipdata->type != HAMMER2_OBJTYPE_HARDLINK);
862
863         /*
864          * UIO write loop
865          */
866         while (uio->uio_resid > 0) {
867                 hammer2_key_t lbase;
868                 hammer2_key_t leof;
869                 int trivial;
870                 int lblksize;
871                 int loff;
872                 int n;
873
874                 /*
875                  * Don't allow the buffer build to blow out the buffer
876                  * cache.
877                  */
878                 if ((ioflag & IO_RECURSE) == 0) {
879                         /*
880                          * XXX should try to leave this unlocked through
881                          *      the whole loop
882                          */
883                         hammer2_inode_unlock_ex(ip);
884                         bwillwrite(HAMMER2_PBUFSIZE);
885                         hammer2_inode_lock_ex(ip);
886                         ipdata = &ip->chain->data->ipdata;      /* reload */
887                 }
888
889                 /* XXX bigwrite & signal check test */
890
891                 /*
892                  * This nominally tells us how much we can cluster and
893                  * what the logical buffer size needs to be.  Currently
894                  * we don't try to cluster the write and just handle one
895                  * block at a time.
896                  */
897                 lblksize = hammer2_calc_logical(ip, uio->uio_offset,
898                                                 &lbase, &leof);
899                 loff = (int)(uio->uio_offset - lbase);
900
901                 /*
902                  * Calculate bytes to copy this transfer and whether the
903                  * copy completely covers the buffer or not.
904                  */
905                 trivial = 0;
906                 n = lblksize - loff;
907                 if (n > uio->uio_resid) {
908                         n = uio->uio_resid;
909                         if (uio->uio_offset + n == ipdata->size)
910                                 trivial = 1;
911                 } else if (loff == 0) {
912                         trivial = 1;
913                 }
914
915                 /*
916                  * Get the buffer
917                  */
918                 if (uio->uio_segflg == UIO_NOCOPY) {
919                         /*
920                          * Issuing a write with the same data backing the
921                          * buffer.  Instantiate the buffer to collect the
922                          * backing vm pages, then read-in any missing bits.
923                          *
924                          * This case is used by vop_stdputpages().
925                          */
926                         bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
927                         if ((bp->b_flags & B_CACHE) == 0) {
928                                 bqrelse(bp);
929                                 error = bread(ip->vp, lbase, lblksize, &bp);
930                         }
931                 } else if (trivial) {
932                         /*
933                          * Even though we are entirely overwriting the buffer
934                          * we may still have to zero it out to avoid a
935                          * mmap/write visibility issue.
936                          */
937                         bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
938                         if ((bp->b_flags & B_CACHE) == 0)
939                                 vfs_bio_clrbuf(bp);
940                 } else {
941                         /*
942                          * Partial overwrite, read in any missing bits then
943                          * replace the portion being written.
944                          *
945                          * (The strategy code will detect zero-fill physical
946                          * blocks for this case).
947                          */
948                         error = bread(ip->vp, lbase, lblksize, &bp);
949                         if (error == 0)
950                                 bheavy(bp);
951                 }
952
953                 if (error) {
954                         brelse(bp);
955                         break;
956                 }
957
958                 /*
959                  * We have to assign physical storage to the buffer we intend
960                  * to dirty or write now to avoid deadlocks in the strategy
961                  * code later.
962                  *
963                  * This can return NOOFFSET for inode-embedded data.  The
964                  * strategy code will take care of it in that case.
965                  */
966                 bp->b_bio2.bio_offset =
967                         hammer2_assign_physical(trans, ip,
968                                                 lbase, lblksize, &error);
969                 ipdata = &ip->chain->data->ipdata;      /* RELOAD */
970                 if (error) {
971                         brelse(bp);
972                         break;
973                 }
974
975                 /*
976                  * Ok, copy the data in
977                  */
978                 hammer2_inode_unlock_ex(ip);
979                 error = uiomove(bp->b_data + loff, n, uio);
980                 hammer2_inode_lock_ex(ip);
981                 ipdata = &ip->chain->data->ipdata;      /* reload */
982                 kflags |= NOTE_WRITE;
983                 modified = 1;
984
985                 if (error) {
986                         brelse(bp);
987                         break;
988                 }
989
990                 /* XXX update ip_data.mtime */
991
992                 /*
993                  * Once we dirty a buffer any cached offset becomes invalid.
994                  *
995                  * NOTE: For cluster_write() always use the trailing block
996                  *       size, which is HAMMER2_PBUFSIZE.  lblksize is the
997                  *       eof-straddling blocksize and is incorrect.
998                  */
999                 bp->b_flags |= B_AGE;
1000                 if ((ioflag & IO_SYNC) ||
1001                     (lbase == 0 && (ipdata->op_flags &
1002                                     HAMMER2_OPFLAG_DIRECTDATA))) {
1003                         /*
1004                          * Synchronous I/O requested or writing to the
1005                          * inode's embedded data (which must be synchronous).
1006                          */
1007                         bwrite(bp);
1008                 } else if ((ioflag & IO_DIRECT) && loff + n == lblksize) {
1009                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
1010                                 bp->b_flags |= B_CLUSTEROK;
1011                         bdwrite(bp);
1012                 } else if (ioflag & IO_ASYNC) {
1013                         bawrite(bp);
1014                 } else if (hammer2_cluster_enable) {
1015                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
1016                                 bp->b_flags |= B_CLUSTEROK;
1017                         cluster_write(bp, leof, HAMMER2_PBUFSIZE, seqcount);
1018                 } else {
1019                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
1020                                 bp->b_flags |= B_CLUSTEROK;
1021                         bdwrite(bp);
1022                 }
1023         }
1024
1025         /*
1026          * Cleanup.  If we extended the file EOF but failed to write through
1027          * the entire write is a failure and we have to back-up.
1028          */
1029         if (error && ipdata->size != old_eof) {
1030                 hammer2_truncate_file(trans, ip, old_eof);
1031                 ipdata = &ip->chain->data->ipdata;      /* RELOAD */
1032         } else if (modified) {
1033                 hammer2_chain_modify(trans, ip->chain, 0);
1034                 ipdata = &ip->chain->data->ipdata;      /* RELOAD */
1035                 hammer2_update_time(&ipdata->mtime);
1036         }
1037         hammer2_knote(ip->vp, kflags);
1038
1039         return error;
1040 }
1041
1042 /*
1043  * Assign physical storage to a logical block.  This function creates the
1044  * related meta-data chains representing the data blocks and marks them
1045  * MODIFIED.  We could mark them MOVED instead but ultimately I need to
1046  * XXX code the flusher to check that the related logical buffer is
1047  * flushed.
1048  *
1049  * NOOFFSET is returned if the data is inode-embedded.  In this case the
1050  * strategy code will simply bcopy() the data into the inode.
1051  *
1052  * The inode's delta_dcount is adjusted.
1053  */
1054 static
1055 hammer2_off_t
1056 hammer2_assign_physical(hammer2_trans_t *trans, hammer2_inode_t *ip,
1057                         hammer2_key_t lbase, int lblksize, int *errorp)
1058 {
1059         hammer2_mount_t *hmp;
1060         hammer2_chain_t *parent;
1061         hammer2_chain_t *chain;
1062         hammer2_off_t pbase;
1063
1064         /*
1065          * Locate the chain associated with lbase, return a locked chain.
1066          * However, do not instantiate any data reference (which utilizes a
1067          * device buffer) because we will be using direct IO via the
1068          * logical buffer cache buffer.
1069          */
1070         hmp = ip->hmp;
1071         *errorp = 0;
1072 retry:
1073         hammer2_inode_lock_ex(ip);
1074         parent = hammer2_chain_lookup_init(ip->chain, 0);
1075         chain = hammer2_chain_lookup(&parent,
1076                                      lbase, lbase,
1077                                      HAMMER2_LOOKUP_NODATA);
1078
1079         if (chain == NULL) {
1080                 /*
1081                  * We found a hole, create a new chain entry.
1082                  *
1083                  * NOTE: DATA chains are created without device backing
1084                  *       store (nor do we want any).
1085                  */
1086                 *errorp = hammer2_chain_create(trans, &parent, &chain,
1087                                                lbase, HAMMER2_PBUFRADIX,
1088                                                HAMMER2_BREF_TYPE_DATA,
1089                                                lblksize);
1090                 if (chain == NULL) {
1091                         hammer2_inode_unlock_ex(ip);
1092                         hammer2_chain_lookup_done(parent);
1093                         panic("hammer2_chain_create: par=%p error=%d\n",
1094                                 parent, *errorp);
1095                         goto retry;
1096                 }
1097
1098                 pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
1099                 /*ip->delta_dcount += lblksize;*/
1100         } else {
1101                 switch (chain->bref.type) {
1102                 case HAMMER2_BREF_TYPE_INODE:
1103                         /*
1104                          * The data is embedded in the inode.  The
1105                          * caller is responsible for marking the inode
1106                          * modified and copying the data to the embedded
1107                          * area.
1108                          */
1109                         pbase = NOOFFSET;
1110                         break;
1111                 case HAMMER2_BREF_TYPE_DATA:
1112                         if (chain->bytes != lblksize) {
1113                                 panic("hammer2_assign_physical: "
1114                                       "size mismatch %d/%d\n",
1115                                       lblksize, chain->bytes);
1116                         }
1117                         hammer2_chain_modify(trans, chain,
1118                                              HAMMER2_MODIFY_OPTDATA);
1119                         pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
1120                         break;
1121                 default:
1122                         panic("hammer2_assign_physical: bad type");
1123                         /* NOT REACHED */
1124                         pbase = NOOFFSET;
1125                         break;
1126                 }
1127         }
1128         if (chain)
1129                 hammer2_chain_unlock(chain);
1130         hammer2_chain_lookup_done(parent);
1131
1132         hammer2_inode_unlock_ex(ip);
1133
1134         return (pbase);
1135 }
1136
1137 /*
1138  * Truncate the size of a file.
1139  *
1140  * This routine adjusts ipdata->size smaller, destroying any related
1141  * data beyond the new EOF and potentially resizing the block straddling
1142  * the EOF.
1143  *
1144  * The inode must be locked.
1145  */
1146 static
1147 void
1148 hammer2_truncate_file(hammer2_trans_t *trans,
1149                       hammer2_inode_t *ip, hammer2_key_t nsize)
1150 {
1151         hammer2_inode_data_t *ipdata;
1152         hammer2_chain_t *parent;
1153         hammer2_chain_t *chain;
1154         hammer2_key_t lbase;
1155         hammer2_key_t leof;
1156         struct buf *bp;
1157         int loff;
1158         int error;
1159         int oblksize;
1160         int nblksize;
1161
1162         hammer2_chain_modify(trans, ip->chain, 0);
1163         bp = NULL;
1164         ipdata = &ip->chain->data->ipdata;
1165         error = 0;
1166
1167         /*
1168          * Destroy any logical buffer cache buffers beyond the file EOF.
1169          *
1170          * We call nvtruncbuf() w/ trivial == 1 to prevent it from messing
1171          * around with the buffer straddling EOF, because we need to assign
1172          * a new physical offset to it.
1173          */
1174         if (ip->vp) {
1175                 nvtruncbuf(ip->vp, nsize,
1176                            HAMMER2_PBUFSIZE, (int)nsize & HAMMER2_PBUFMASK,
1177                            1);
1178         }
1179
1180         /*
1181          * Setup for lookup/search
1182          */
1183         parent = hammer2_chain_lookup_init(ip->chain, 0);
1184
1185         /*
1186          * Handle the case where a chain/logical-buffer straddles the new
1187          * EOF.  We told nvtruncbuf() above not to mess with the logical
1188          * buffer straddling the EOF because we need to reassign its storage
1189          * and can't let the strategy code do it for us.
1190          */
1191         loff = (int)nsize & HAMMER2_PBUFMASK;
1192         if (loff && ip->vp) {
1193                 oblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
1194                 error = bread(ip->vp, lbase, oblksize, &bp);
1195                 KKASSERT(error == 0);
1196         }
1197         ipdata->size = nsize;
1198         nblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
1199
1200         /*
1201          * Fixup the chain element.  If we have a logical buffer in-hand
1202          * we don't want to create a conflicting device buffer.
1203          */
1204         if (loff && bp) {
1205                 chain = hammer2_chain_lookup(&parent, lbase, lbase,
1206                                              HAMMER2_LOOKUP_NODATA);
1207                 if (chain) {
1208                         switch(chain->bref.type) {
1209                         case HAMMER2_BREF_TYPE_DATA:
1210                                 hammer2_chain_resize(trans, ip, bp,
1211                                              parent, &chain,
1212                                              hammer2_allocsize(nblksize),
1213                                              HAMMER2_MODIFY_OPTDATA);
1214                                 allocbuf(bp, nblksize);
1215                                 bzero(bp->b_data + loff, nblksize - loff);
1216                                 bp->b_bio2.bio_offset = chain->bref.data_off &
1217                                                         HAMMER2_OFF_MASK;
1218                                 break;
1219                         case HAMMER2_BREF_TYPE_INODE:
1220                                 allocbuf(bp, nblksize);
1221                                 bzero(bp->b_data + loff, nblksize - loff);
1222                                 bp->b_bio2.bio_offset = NOOFFSET;
1223                                 break;
1224                         default:
1225                                 panic("hammer2_truncate_file: bad type");
1226                                 break;
1227                         }
1228                         hammer2_chain_unlock(chain);
1229                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
1230                                 bp->b_flags |= B_CLUSTEROK;
1231                         if (lbase == 0 &&
1232                             (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA)) {
1233                                 /*
1234                                  * Must be synchronous if writing to the
1235                                  * inode's embedded data area.
1236                                  */
1237                                 bwrite(bp);
1238                         } else {
1239                                 /*
1240                                  * Else a delayed-write is fine.
1241                                  */
1242                                 bdwrite(bp);
1243                         }
1244                 } else {
1245                         /*
1246                          * Destroy clean buffer w/ wrong buffer size.  Retain
1247                          * backing store.
1248                          */
1249                         bp->b_flags |= B_RELBUF;
1250                         KKASSERT(bp->b_bio2.bio_offset == NOOFFSET);
1251                         KKASSERT((bp->b_flags & B_DIRTY) == 0);
1252                         bqrelse(bp);
1253                 }
1254         } else if (loff) {
1255                 /*
1256                  * WARNING: This utilizes a device buffer for the data.
1257                  *
1258                  * This case should not occur because file truncations without
1259                  * a vnode (and hence no logical buffer cache) should only
1260                  * always truncate to 0-length.
1261                  */
1262                 panic("hammer2_truncate_file: non-zero truncation, no-vnode");
1263 #if 0
1264                 chain = hammer2_chain_lookup(&parent, lbase, lbase, 0);
1265                 if (chain) {
1266                         switch(chain->bref.type) {
1267                         case HAMMER2_BREF_TYPE_DATA:
1268                                 chain = hammer2_chain_resize(trans, ip, bp,
1269                                              parent, chain,
1270                                              hammer2_allocsize(nblksize),
1271                                              0);
1272                                 hammer2_chain_modify(hmp, chain, 0);
1273                                 bzero(chain->data->buf + loff, nblksize - loff);
1274                                 break;
1275                         case HAMMER2_BREF_TYPE_INODE:
1276                                 if (loff < HAMMER2_EMBEDDED_BYTES) {
1277                                         hammer2_chain_modify(hmp, chain, 0);
1278                                         bzero(chain->data->ipdata.u.data + loff,
1279                                               HAMMER2_EMBEDDED_BYTES - loff);
1280                                 }
1281                                 break;
1282                         }
1283                         hammer2_chain_unlock(chain);
1284                 }
1285 #endif
1286         }
1287
1288         /*
1289          * Clean up any fragmentory VM pages now that we have properly
1290          * resized the straddling buffer.  These pages are no longer
1291          * part of the buffer.
1292          */
1293         if (ip->vp) {
1294                 nvtruncbuf(ip->vp, nsize,
1295                            nblksize, (int)nsize & (nblksize - 1),
1296                            1);
1297         }
1298
1299         /*
1300          * Destroy any physical blocks after the new EOF point.
1301          */
1302         lbase = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
1303         chain = hammer2_chain_lookup(&parent,
1304                                      lbase, (hammer2_key_t)-1,
1305                                      HAMMER2_LOOKUP_NODATA);
1306         while (chain) {
1307                 /*
1308                  * Degenerate embedded data case, nothing to loop on.
1309                  */
1310                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1311                         hammer2_chain_unlock(chain);
1312                         break;
1313                 }
1314
1315                 /*
1316                  * Delete physical data blocks past the file EOF.
1317                  */
1318                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1319                         /*ip->delta_dcount -= chain->bytes;*/
1320                         hammer2_chain_delete(trans, parent, chain);
1321                 }
1322                 /* XXX check parent if empty indirect block & delete */
1323                 chain = hammer2_chain_next(&parent, chain,
1324                                            lbase, (hammer2_key_t)-1,
1325                                            HAMMER2_LOOKUP_NODATA);
1326         }
1327         hammer2_chain_lookup_done(parent);
1328 }
1329
1330 /*
1331  * Extend the size of a file.  The inode must be locked.
1332  *
1333  * We may have to resize the block straddling the old EOF.
1334  */
1335 static
1336 void
1337 hammer2_extend_file(hammer2_trans_t *trans,
1338                     hammer2_inode_t *ip, hammer2_key_t nsize)
1339 {
1340         hammer2_inode_data_t *ipdata;
1341         hammer2_mount_t *hmp;
1342         hammer2_chain_t *parent;
1343         hammer2_chain_t *chain;
1344         struct buf *bp;
1345         hammer2_key_t osize;
1346         hammer2_key_t obase;
1347         hammer2_key_t nbase;
1348         hammer2_key_t leof;
1349         int oblksize;
1350         int nblksize;
1351         int nradix;
1352         int error;
1353
1354         KKASSERT(ip->vp);
1355         hmp = ip->hmp;
1356
1357         hammer2_chain_modify(trans, ip->chain, 0);
1358         ipdata = &ip->chain->data->ipdata;
1359
1360         /*
1361          * Nothing to do if the direct-data case is still intact
1362          */
1363         if ((ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
1364             nsize <= HAMMER2_EMBEDDED_BYTES) {
1365                 ipdata->size = nsize;
1366                 nvextendbuf(ip->vp,
1367                             ipdata->size, nsize,
1368                             0, HAMMER2_EMBEDDED_BYTES,
1369                             0, (int)nsize,
1370                             1);
1371                 /* ipdata = &ip->chain->data->ipdata; RELOAD */
1372                 return;
1373         }
1374
1375         /*
1376          * Calculate the blocksize at the original EOF and resize the block
1377          * if necessary.  Adjust the file size in the inode.
1378          */
1379         osize = ipdata->size;
1380         oblksize = hammer2_calc_logical(ip, osize, &obase, &leof);
1381         ipdata->size = nsize;
1382         nblksize = hammer2_calc_logical(ip, osize, &nbase, &leof);
1383
1384         /*
1385          * Do all required vnode operations, but do not mess with the
1386          * buffer straddling the orignal EOF.
1387          */
1388         nvextendbuf(ip->vp,
1389                     ipdata->size, nsize,
1390                     0, nblksize,
1391                     0, (int)nsize & HAMMER2_PBUFMASK,
1392                     1);
1393         ipdata = &ip->chain->data->ipdata;
1394
1395         /*
1396          * Early return if we have no more work to do.
1397          */
1398         if (obase == nbase && oblksize == nblksize &&
1399             (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1400                 return;
1401         }
1402
1403         /*
1404          * We have work to do, including possibly resizing the buffer
1405          * at the previous EOF point and turning off DIRECTDATA mode.
1406          */
1407         bp = NULL;
1408         if (((int)osize & HAMMER2_PBUFMASK)) {
1409                 error = bread(ip->vp, obase, oblksize, &bp);
1410                 KKASSERT(error == 0);
1411         }
1412
1413         /*
1414          * Disable direct-data mode by loading up a buffer cache buffer
1415          * with the data, then converting the inode data area into the
1416          * inode indirect block array area.
1417          */
1418         if (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1419                 ipdata->op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
1420                 bzero(&ipdata->u.blockset, sizeof(ipdata->u.blockset));
1421         }
1422
1423         /*
1424          * Resize the chain element at the old EOF.
1425          */
1426         if (((int)osize & HAMMER2_PBUFMASK)) {
1427 retry:
1428                 error = 0;
1429                 parent = hammer2_chain_lookup_init(ip->chain, 0);
1430                 nradix = hammer2_allocsize(nblksize);
1431
1432                 chain = hammer2_chain_lookup(&parent,
1433                                              obase, obase,
1434                                              HAMMER2_LOOKUP_NODATA);
1435                 if (chain == NULL) {
1436                         error = hammer2_chain_create(trans, &parent, &chain,
1437                                                      obase, nblksize,
1438                                                      HAMMER2_BREF_TYPE_DATA,
1439                                                      nblksize);
1440                         if (chain == NULL) {
1441                                 hammer2_chain_lookup_done(parent);
1442                                 panic("hammer2_chain_create: par=%p error=%d\n",
1443                                         parent, error);
1444                                 goto retry;
1445                         }
1446                         /*ip->delta_dcount += nblksize;*/
1447                 } else {
1448                         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
1449                         hammer2_chain_resize(trans, ip, bp,
1450                                              parent, &chain,
1451                                              nradix,
1452                                              HAMMER2_MODIFY_OPTDATA);
1453                 }
1454                 if (obase != nbase) {
1455                         if (oblksize != HAMMER2_PBUFSIZE)
1456                                 allocbuf(bp, HAMMER2_PBUFSIZE);
1457                 } else {
1458                         if (oblksize != nblksize)
1459                                 allocbuf(bp, nblksize);
1460                 }
1461                 bp->b_bio2.bio_offset = chain->bref.data_off &
1462                                         HAMMER2_OFF_MASK;
1463                 hammer2_chain_unlock(chain);
1464                 if (bp->b_bcount == HAMMER2_PBUFSIZE)
1465                         bp->b_flags |= B_CLUSTEROK;
1466                 bdwrite(bp);
1467                 hammer2_chain_lookup_done(parent);  /* must be after bdwrite */
1468         }
1469 }
1470
1471 static
1472 int
1473 hammer2_vop_nresolve(struct vop_nresolve_args *ap)
1474 {
1475         hammer2_inode_t *ip;
1476         hammer2_inode_t *dip;
1477         hammer2_mount_t *hmp;
1478         hammer2_chain_t *parent;
1479         hammer2_chain_t *chain;
1480         hammer2_chain_t *ochain;
1481         hammer2_trans_t trans;
1482         struct namecache *ncp;
1483         const uint8_t *name;
1484         size_t name_len;
1485         hammer2_key_t lhc;
1486         int error = 0;
1487         struct vnode *vp;
1488
1489         dip = VTOI(ap->a_dvp);
1490         hmp = dip->hmp;
1491         ncp = ap->a_nch->ncp;
1492         name = ncp->nc_name;
1493         name_len = ncp->nc_nlen;
1494         lhc = hammer2_dirhash(name, name_len);
1495
1496         /*
1497          * Note: In DragonFly the kernel handles '.' and '..'.
1498          */
1499         hammer2_inode_lock_sh(dip);
1500         parent = hammer2_chain_lookup_init(dip->chain, HAMMER2_LOOKUP_SHARED);
1501         chain = hammer2_chain_lookup(&parent,
1502                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
1503                                      HAMMER2_LOOKUP_SHARED);
1504         while (chain) {
1505                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
1506                     name_len == chain->data->ipdata.name_len &&
1507                     bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
1508                         break;
1509                 }
1510                 chain = hammer2_chain_next(&parent, chain,
1511                                            lhc, lhc + HAMMER2_DIRHASH_LOMASK,
1512                                            HAMMER2_LOOKUP_SHARED);
1513         }
1514         hammer2_chain_lookup_done(parent);
1515         hammer2_inode_unlock_sh(dip);
1516
1517         /*
1518          * If the inode represents a forwarding entry for a hardlink we have
1519          * to locate the actual inode.  The original ip is saved for possible
1520          * deconsolidation.  (ip) will only be set to non-NULL when we have
1521          * to locate the real file via a hardlink.  ip will be referenced but
1522          * not locked in that situation.  chain is passed in locked and
1523          * returned locked.
1524          *
1525          * XXX what kind of chain lock?
1526          */
1527         ochain = NULL;
1528         if (chain && chain->data->ipdata.type == HAMMER2_OBJTYPE_HARDLINK) {
1529                 error = hammer2_hardlink_find(dip, &chain, &ochain);
1530                 if (error) {
1531                         kprintf("hammer2: unable to find hardlink\n");
1532                         if (chain) {
1533                                 hammer2_chain_unlock(chain);
1534                                 chain = NULL;
1535                         }
1536                         goto failed;
1537                 }
1538         }
1539
1540         /*
1541          * Deconsolidate any hardlink whos nlinks == 1.  Ignore errors.
1542          * If an error occurs chain and ip are left alone.
1543          *
1544          * XXX upgrade shared lock?
1545          */
1546         if (ochain && chain && chain->data->ipdata.nlinks == 1 && !hmp->ronly) {
1547                 kprintf("hammer2: need to unconsolidate hardlink for %s\n",
1548                         chain->data->ipdata.filename);
1549                 /* XXX retain shared lock on dip? (currently not held) */
1550                 hammer2_trans_init(&trans, dip->hmp);
1551                 hammer2_hardlink_deconsolidate(&trans, dip, &chain, &ochain);
1552                 hammer2_trans_done(&trans);
1553         }
1554
1555         /*
1556          * Acquire the related vnode
1557          *
1558          * NOTE: For error processing, only ENOENT resolves the namecache
1559          *       entry to NULL, otherwise we just return the error and
1560          *       leave the namecache unresolved.
1561          *
1562          * NOTE: multiple hammer2_inode structures can be aliased to the
1563          *       same chain element, for example for hardlinks.  This
1564          *       use case does not 'reattach' inode associations that
1565          *       might already exist, but always allocates a new one.
1566          *
1567          * WARNING: inode structure is locked exclusively via inode_get
1568          *          but chain was locked shared.  inode_unlock_ex()
1569          *          will handle it properly.
1570          */
1571         if (chain) {
1572                 ip = hammer2_inode_get(hmp, dip->pmp, dip, chain);
1573                 vp = hammer2_igetv(ip, &error);
1574                 if (error == 0) {
1575                         vn_unlock(vp);
1576                         cache_setvp(ap->a_nch, vp);
1577                 } else if (error == ENOENT) {
1578                         cache_setvp(ap->a_nch, NULL);
1579                 }
1580                 hammer2_inode_unlock_ex(ip);
1581
1582                 /*
1583                  * The vp should not be released until after we've disposed
1584                  * of our locks, because it might cause vop_inactive() to
1585                  * be called.
1586                  */
1587                 if (vp)
1588                         vrele(vp);
1589         } else {
1590                 error = ENOENT;
1591                 cache_setvp(ap->a_nch, NULL);
1592         }
1593 failed:
1594         KASSERT(error || ap->a_nch->ncp->nc_vp != NULL,
1595                 ("resolve error %d/%p chain %p ap %p\n",
1596                  error, ap->a_nch->ncp->nc_vp, chain, ap));
1597         if (ochain)
1598                 hammer2_chain_drop(ochain);
1599         return error;
1600 }
1601
1602 static
1603 int
1604 hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1605 {
1606         hammer2_inode_t *dip;
1607         hammer2_inode_t *ip;
1608         hammer2_mount_t *hmp;
1609         int error;
1610
1611         dip = VTOI(ap->a_dvp);
1612         hmp = dip->hmp;
1613
1614         if ((ip = dip->pip) == NULL) {
1615                 *ap->a_vpp = NULL;
1616                 return ENOENT;
1617         }
1618         hammer2_inode_lock_ex(ip);
1619         *ap->a_vpp = hammer2_igetv(ip, &error);
1620         hammer2_inode_unlock_ex(ip);
1621
1622         return error;
1623 }
1624
1625 static
1626 int
1627 hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
1628 {
1629         hammer2_mount_t *hmp;
1630         hammer2_inode_t *dip;
1631         hammer2_inode_t *nip;
1632         hammer2_trans_t trans;
1633         struct namecache *ncp;
1634         const uint8_t *name;
1635         size_t name_len;
1636         int error;
1637
1638         dip = VTOI(ap->a_dvp);
1639         hmp = dip->hmp;
1640         if (hmp->ronly)
1641                 return (EROFS);
1642
1643         ncp = ap->a_nch->ncp;
1644         name = ncp->nc_name;
1645         name_len = ncp->nc_nlen;
1646
1647         hammer2_trans_init(&trans, hmp);
1648         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1649                                    name, name_len, &error);
1650         if (error) {
1651                 KKASSERT(nip == NULL);
1652                 *ap->a_vpp = NULL;
1653         } else {
1654                 *ap->a_vpp = hammer2_igetv(nip, &error);
1655                 hammer2_inode_unlock_ex(nip);
1656         }
1657         hammer2_trans_done(&trans);
1658
1659         if (error == 0) {
1660                 cache_setunresolved(ap->a_nch);
1661                 cache_setvp(ap->a_nch, *ap->a_vpp);
1662         }
1663         return error;
1664 }
1665
1666 /*
1667  * Return the largest contiguous physical disk range for the logical
1668  * request.
1669  *
1670  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
1671  */
1672 static
1673 int
1674 hammer2_vop_bmap(struct vop_bmap_args *ap)
1675 {
1676         struct vnode *vp;
1677         hammer2_mount_t *hmp;
1678         hammer2_inode_t *ip;
1679         hammer2_chain_t *parent;
1680         hammer2_chain_t *chain;
1681         hammer2_key_t lbeg;
1682         hammer2_key_t lend;
1683         hammer2_off_t pbeg;
1684         hammer2_off_t pbytes;
1685         hammer2_off_t array[HAMMER2_BMAP_COUNT][2];
1686         int loff;
1687         int ai;
1688
1689         /*
1690          * Only supported on regular files
1691          *
1692          * Only supported for read operations (required for cluster_read).
1693          * The block allocation is delayed for write operations.
1694          */
1695         vp = ap->a_vp;
1696         if (vp->v_type != VREG)
1697                 return (EOPNOTSUPP);
1698         if (ap->a_cmd != BUF_CMD_READ)
1699                 return (EOPNOTSUPP);
1700
1701         ip = VTOI(vp);
1702         hmp = ip->hmp;
1703         bzero(array, sizeof(array));
1704
1705         /*
1706          * Calculate logical range
1707          */
1708         KKASSERT((ap->a_loffset & HAMMER2_LBUFMASK64) == 0);
1709         lbeg = ap->a_loffset & HAMMER2_OFF_MASK_HI;
1710         lend = lbeg + HAMMER2_BMAP_COUNT * HAMMER2_PBUFSIZE - 1;
1711         if (lend < lbeg)
1712                 lend = lbeg;
1713         loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;
1714
1715         hammer2_inode_lock_sh(ip);
1716         parent = hammer2_chain_lookup_init(ip->chain, HAMMER2_LOOKUP_SHARED);
1717         chain = hammer2_chain_lookup(&parent,
1718                                      lbeg, lend,
1719                                      HAMMER2_LOOKUP_NODATA |
1720                                      HAMMER2_LOOKUP_SHARED);
1721         if (chain == NULL) {
1722                 *ap->a_doffsetp = ZFOFFSET;
1723                 hammer2_chain_lookup_done(parent);
1724                 hammer2_inode_unlock_sh(ip);
1725                 return (0);
1726         }
1727
1728         while (chain) {
1729                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1730                         ai = (chain->bref.key - lbeg) / HAMMER2_PBUFSIZE;
1731                         KKASSERT(ai >= 0 && ai < HAMMER2_BMAP_COUNT);
1732                         array[ai][0] = chain->bref.data_off & HAMMER2_OFF_MASK;
1733                         array[ai][1] = chain->bytes;
1734                 }
1735                 chain = hammer2_chain_next(&parent, chain,
1736                                            lbeg, lend,
1737                                            HAMMER2_LOOKUP_NODATA |
1738                                            HAMMER2_LOOKUP_SHARED);
1739         }
1740         hammer2_chain_lookup_done(parent);
1741         hammer2_inode_unlock_sh(ip);
1742
1743         /*
1744          * If the requested loffset is not mappable physically we can't
1745          * bmap.  The caller will have to access the file data via a
1746          * device buffer.
1747          */
1748         if (array[0][0] == 0 || array[0][1] < loff + HAMMER2_LBUFSIZE) {
1749                 *ap->a_doffsetp = NOOFFSET;
1750                 return (0);
1751         }
1752
1753         /*
1754          * Calculate the physical disk offset range for array[0]
1755          */
1756         pbeg = array[0][0] + loff;
1757         pbytes = array[0][1] - loff;
1758
1759         for (ai = 1; ai < HAMMER2_BMAP_COUNT; ++ai) {
1760                 if (array[ai][0] != pbeg + pbytes)
1761                         break;
1762                 pbytes += array[ai][1];
1763         }
1764
1765         *ap->a_doffsetp = pbeg;
1766         if (ap->a_runp)
1767                 *ap->a_runp = pbytes;
1768         return (0);
1769 }
1770
/*
 * hammer2_vop_open
 *
 * Nothing filesystem-specific is done here; the request is passed
 * unchanged to vop_stdopen() and its result is returned.
 */
static
int
hammer2_vop_open(struct vop_open_args *ap)
{
	int error;

	error = vop_stdopen(ap);
	return (error);
}
1777
1778 /*
1779  * hammer2_vop_advlock { vp, id, op, fl, flags }
1780  */
1781 static
1782 int
1783 hammer2_vop_advlock(struct vop_advlock_args *ap)
1784 {
1785         hammer2_inode_t *ip = VTOI(ap->a_vp);
1786         hammer2_off_t size;
1787
1788         hammer2_inode_lock_sh(ip);
1789         size = ip->chain->data->ipdata.size;
1790         hammer2_inode_unlock_sh(ip);
1791         return (lf_advlock(ap, &ip->advlock, size));
1792 }
1793
1794
/*
 * hammer2_vop_close
 *
 * Nothing filesystem-specific is done here; the request is passed
 * unchanged to vop_stdclose() and its result is returned.
 */
static
int
hammer2_vop_close(struct vop_close_args *ap)
{
	int error;

	error = vop_stdclose(ap);
	return (error);
}
1801
1802 /*
1803  * hammer2_vop_nlink { nch, dvp, vp, cred }
1804  *
1805  * Create a hardlink from (vp) to {dvp, nch}.
1806  */
1807 static
1808 int
1809 hammer2_vop_nlink(struct vop_nlink_args *ap)
1810 {
1811         hammer2_inode_t *dip;   /* target directory to create link in */
1812         hammer2_inode_t *ip;    /* inode we are hardlinking to */
1813         hammer2_mount_t *hmp;
1814         hammer2_chain_t *chain;
1815         hammer2_trans_t trans;
1816         struct namecache *ncp;
1817         const uint8_t *name;
1818         size_t name_len;
1819         int error;
1820
1821         dip = VTOI(ap->a_dvp);
1822         hmp = dip->hmp;
1823         if (hmp->ronly)
1824                 return (EROFS);
1825
1826         ncp = ap->a_nch->ncp;
1827         name = ncp->nc_name;
1828         name_len = ncp->nc_nlen;
1829         hammer2_trans_init(&trans, hmp);
1830
1831         /*
1832          * ip represents the file being hardlinked.  The file could be a
1833          * normal file or a hardlink target if it has already been hardlinked.
1834          * If ip is a hardlinked target then ip->pip represents the location
1835          * of the hardlinked target, NOT the location of the hardlink pointer.
1836          *
1837          * Bump nlinks and potentially also create or move the hardlink
1838          * target in the parent directory common to (ip) and (dip).  The
1839          * consolidation code can modify ip->chain and ip->pip.  The
1840          * returned chain is locked.
1841          */
1842         ip = VTOI(ap->a_vp);
1843         hammer2_inode_lock_ex(ip);
1844         error = hammer2_hardlink_consolidate(&trans, ip, &chain, dip, 1);
1845         if (error)
1846                 goto done;
1847
1848         /*
1849          * Create a directory entry connected to the specified chain.
1850          * The hardlink consolidation code has already adjusted ip->pip
1851          * to the common parent directory containing the actual hardlink
1852          *
1853          * (which may be different from dip where we created our hardlink
1854          * entry. ip->chain always represents the actual hardlink and not
1855          * any of the pointers to the actual hardlink).
1856          */
1857         error = hammer2_inode_connect(&trans, 1,
1858                                       dip, &chain,
1859                                       name, name_len);
1860         if (error == 0) {
1861                 cache_setunresolved(ap->a_nch);
1862                 cache_setvp(ap->a_nch, ap->a_vp);
1863         }
1864 done:
1865         if (chain)
1866                 hammer2_chain_unlock(chain);
1867         hammer2_inode_unlock_ex(ip);
1868         hammer2_trans_done(&trans);
1869
1870         return error;
1871 }
1872
1873 /*
1874  * hammer2_vop_ncreate { nch, dvp, vpp, cred, vap }
1875  *
1876  * The operating system has already ensured that the directory entry
1877  * does not exist and done all appropriate namespace locking.
1878  */
1879 static
1880 int
1881 hammer2_vop_ncreate(struct vop_ncreate_args *ap)
1882 {
1883         hammer2_mount_t *hmp;
1884         hammer2_inode_t *dip;
1885         hammer2_inode_t *nip;
1886         hammer2_trans_t trans;
1887         struct namecache *ncp;
1888         const uint8_t *name;
1889         size_t name_len;
1890         int error;
1891
1892         dip = VTOI(ap->a_dvp);
1893         hmp = dip->hmp;
1894         if (hmp->ronly)
1895                 return (EROFS);
1896
1897         ncp = ap->a_nch->ncp;
1898         name = ncp->nc_name;
1899         name_len = ncp->nc_nlen;
1900         hammer2_trans_init(&trans, hmp);
1901
1902         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1903                                    name, name_len, &error);
1904         if (error) {
1905                 KKASSERT(nip == NULL);
1906                 *ap->a_vpp = NULL;
1907         } else {
1908                 *ap->a_vpp = hammer2_igetv(nip, &error);
1909                 hammer2_inode_unlock_ex(nip);
1910         }
1911         hammer2_trans_done(&trans);
1912
1913         if (error == 0) {
1914                 cache_setunresolved(ap->a_nch);
1915                 cache_setvp(ap->a_nch, *ap->a_vpp);
1916         }
1917         return error;
1918 }
1919
1920 /*
1921  * hammer2_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1922  */
1923 static
1924 int
1925 hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
1926 {
1927         hammer2_mount_t *hmp;
1928         hammer2_inode_t *dip;
1929         hammer2_inode_t *nip;
1930         hammer2_trans_t trans;
1931         struct namecache *ncp;
1932         const uint8_t *name;
1933         size_t name_len;
1934         int error;
1935
1936         dip = VTOI(ap->a_dvp);
1937         hmp = dip->hmp;
1938         if (hmp->ronly)
1939                 return (EROFS);
1940
1941         ncp = ap->a_nch->ncp;
1942         name = ncp->nc_name;
1943         name_len = ncp->nc_nlen;
1944         hammer2_trans_init(&trans, hmp);
1945
1946         ap->a_vap->va_type = VLNK;      /* enforce type */
1947
1948         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1949                                    name, name_len, &error);
1950         if (error) {
1951                 KKASSERT(nip == NULL);
1952                 *ap->a_vpp = NULL;
1953                 hammer2_trans_done(&trans);
1954                 return error;
1955         }
1956         *ap->a_vpp = hammer2_igetv(nip, &error);
1957
1958         /*
1959          * Build the softlink (~like file data) and finalize the namecache.
1960          */
1961         if (error == 0) {
1962                 size_t bytes;
1963                 struct uio auio;
1964                 struct iovec aiov;
1965                 hammer2_inode_data_t *nipdata;
1966
1967                 nipdata = &nip->chain->data->ipdata;
1968                 bytes = strlen(ap->a_target);
1969
1970                 if (bytes <= HAMMER2_EMBEDDED_BYTES) {
1971                         KKASSERT(nipdata->op_flags &
1972                                  HAMMER2_OPFLAG_DIRECTDATA);
1973                         bcopy(ap->a_target, nipdata->u.data, bytes);
1974                         nipdata->size = bytes;
1975                 } else {
1976                         bzero(&auio, sizeof(auio));
1977                         bzero(&aiov, sizeof(aiov));
1978                         auio.uio_iov = &aiov;
1979                         auio.uio_segflg = UIO_SYSSPACE;
1980                         auio.uio_rw = UIO_WRITE;
1981                         auio.uio_resid = bytes;
1982                         auio.uio_iovcnt = 1;
1983                         auio.uio_td = curthread;
1984                         aiov.iov_base = ap->a_target;
1985                         aiov.iov_len = bytes;
1986                         error = hammer2_write_file(nip, &trans,
1987                                                    &auio, IO_APPEND, 0);
1988                         nipdata = &nip->chain->data->ipdata; /* RELOAD */
1989                         /* XXX handle error */
1990                         error = 0;
1991                 }
1992         }
1993         hammer2_inode_unlock_ex(nip);
1994         hammer2_trans_done(&trans);
1995
1996         /*
1997          * Finalize namecache
1998          */
1999         if (error == 0) {
2000                 cache_setunresolved(ap->a_nch);
2001                 cache_setvp(ap->a_nch, *ap->a_vpp);
2002                 /* hammer2_knote(ap->a_dvp, NOTE_WRITE); */
2003         }
2004         return error;
2005 }
2006
2007 /*
2008  * hammer2_vop_nremove { nch, dvp, cred }
2009  */
2010 static
2011 int
2012 hammer2_vop_nremove(struct vop_nremove_args *ap)
2013 {
2014         hammer2_inode_t *dip;
2015         hammer2_mount_t *hmp;
2016         hammer2_trans_t trans;
2017         struct namecache *ncp;
2018         const uint8_t *name;
2019         size_t name_len;
2020         int error;
2021
2022         dip = VTOI(ap->a_dvp);
2023         hmp = dip->hmp;
2024         if (hmp->ronly)
2025                 return(EROFS);
2026
2027         ncp = ap->a_nch->ncp;
2028         name = ncp->nc_name;
2029         name_len = ncp->nc_nlen;
2030         hammer2_trans_init(&trans, hmp);
2031         error = hammer2_unlink_file(&trans, dip, name, name_len, 0, NULL);
2032         hammer2_trans_done(&trans);
2033         if (error == 0) {
2034                 cache_unlink(ap->a_nch);
2035         }
2036         return (error);
2037 }
2038
2039 /*
2040  * hammer2_vop_nrmdir { nch, dvp, cred }
2041  */
2042 static
2043 int
2044 hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
2045 {
2046         hammer2_inode_t *dip;
2047         hammer2_mount_t *hmp;
2048         hammer2_trans_t trans;
2049         struct namecache *ncp;
2050         const uint8_t *name;
2051         size_t name_len;
2052         int error;
2053
2054         dip = VTOI(ap->a_dvp);
2055         hmp = dip->hmp;
2056         if (hmp->ronly)
2057                 return(EROFS);
2058
2059         ncp = ap->a_nch->ncp;
2060         name = ncp->nc_name;
2061         name_len = ncp->nc_nlen;
2062
2063         hammer2_trans_init(&trans, hmp);
2064         error = hammer2_unlink_file(&trans, dip, name, name_len, 1, NULL);
2065         hammer2_trans_done(&trans);
2066         if (error == 0) {
2067                 cache_unlink(ap->a_nch);
2068         }
2069         return (error);
2070 }
2071
2072 /*
2073  * hammer2_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
2074  */
2075 static
2076 int
2077 hammer2_vop_nrename(struct vop_nrename_args *ap)
2078 {
2079         struct namecache *fncp;
2080         struct namecache *tncp;
2081         hammer2_inode_t *fdip;
2082         hammer2_inode_t *tdip;
2083         hammer2_inode_t *ip;
2084         hammer2_chain_t *chain;
2085         hammer2_mount_t *hmp;
2086         hammer2_trans_t trans;
2087         const uint8_t *fname;
2088         size_t fname_len;
2089         const uint8_t *tname;
2090         size_t tname_len;
2091         int error;
2092         int hlink;
2093
2094         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
2095                 return(EXDEV);
2096         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
2097                 return(EXDEV);
2098
2099         fdip = VTOI(ap->a_fdvp);        /* source directory */
2100         tdip = VTOI(ap->a_tdvp);        /* target directory */
2101
2102         hmp = fdip->hmp;                /* check read-only filesystem */
2103         if (hmp->ronly)
2104                 return(EROFS);
2105
2106         fncp = ap->a_fnch->ncp;         /* entry name in source */
2107         fname = fncp->nc_name;
2108         fname_len = fncp->nc_nlen;
2109
2110         tncp = ap->a_tnch->ncp;         /* entry name in target */
2111         tname = tncp->nc_name;
2112         tname_len = tncp->nc_nlen;
2113
2114         hammer2_trans_init(&trans, hmp);
2115
2116         /*
2117          * ip is the inode being renamed.  If this is a hardlink then
2118          * ip represents the actual file and not the hardlink marker.
2119          */
2120         ip = VTOI(fncp->nc_vp);
2121         chain = NULL;
2122
2123         /*
2124          * Keep a tight grip on the inode so the temporary unlinking from
2125          * the source location prior to linking to the target location
2126          * does not cause the chain to be destroyed.
2127          *
2128          * NOTE: To avoid deadlocks we cannot lock (ip) while we are
2129          *       unlinking elements from their directories.  Locking
2130          *       the nlinks field does not lock the whole inode.
2131          */
2132         hammer2_inode_ref(ip);
2133
2134         /*
2135          * Remove target if it exists
2136          */
2137         error = hammer2_unlink_file(&trans, tdip, tname, tname_len, -1, NULL);
2138         if (error && error != ENOENT)
2139                 goto done;
2140         cache_setunresolved(ap->a_tnch);
2141
2142         /*
2143          * When renaming a hardlinked file we may have to re-consolidate
2144          * the location of the hardlink target.  Since the element is simply
2145          * being moved, nlinks is not modified in this case.
2146          *
2147          * If ip represents a regular file the consolidation code essentially
2148          * does nothing other than return the locked chain.
2149          *
2150          * The returned chain will be locked.
2151          *
2152          * WARNING!  We do not currently have a local copy of ipdata but
2153          *           we do use one later remember that it must be reloaded
2154          *           on any modification to the inode, including connects.
2155          */
2156         hammer2_inode_lock_ex(ip);
2157         error = hammer2_hardlink_consolidate(&trans, ip, &chain, tdip, 0);
2158         if (error)
2159                 goto done;
2160
2161         /*
2162          * Disconnect (fdip, fname) from the source directory.  This will
2163          * disconnect (ip) if it represents a direct file.  If (ip) represents
2164          * a hardlink the HARDLINK pointer object will be removed but the
2165          * hardlink will stay intact.
2166          *
2167          * The target chain may be marked DELETED but will not be destroyed
2168          * since we retain our hold on ip and chain.
2169          */
2170         error = hammer2_unlink_file(&trans, fdip, fname, fname_len, -1, &hlink);
2171         KKASSERT(error != EAGAIN);
2172         if (error)
2173                 goto done;
2174
2175         /*
2176          * Reconnect ip to target directory using chain.  Chains cannot
2177          * actually be moved, so this will duplicate the chain in the new
2178          * spot and assign it to the ip, replacing the old chain.
2179          *
2180          * WARNING: chain locks can lock buffer cache buffers, to avoid
2181          *          deadlocks we want to unlock before issuing a cache_*()
2182          *          op (that might have to lock a vnode).
2183          */
2184         error = hammer2_inode_connect(&trans, hlink,
2185                                       tdip, &chain,
2186                                       tname, tname_len);
2187         if (error == 0) {
2188                 KKASSERT(chain != NULL);
2189                 hammer2_inode_repoint(ip, (hlink ? ip->pip : tdip), chain);
2190                 cache_rename(ap->a_fnch, ap->a_tnch);
2191         }
2192 done:
2193         if (chain)
2194                 hammer2_chain_unlock(chain);
2195         hammer2_inode_unlock_ex(ip);
2196         hammer2_inode_drop(ip);
2197         hammer2_trans_done(&trans);
2198
2199         return (error);
2200 }
2201
2202 static int hammer2_strategy_read(struct vop_strategy_args *ap);
2203 static int hammer2_strategy_write(struct vop_strategy_args *ap);
2204
2205 static
2206 int
2207 hammer2_vop_strategy(struct vop_strategy_args *ap)
2208 {
2209         struct bio *biop;
2210         struct buf *bp;
2211         int error;
2212
2213         biop = ap->a_bio;
2214         bp = biop->bio_buf;
2215
2216         switch(bp->b_cmd) {
2217         case BUF_CMD_READ:
2218                 error = hammer2_strategy_read(ap);
2219                 ++hammer2_iod_file_read;
2220                 break;
2221         case BUF_CMD_WRITE:
2222                 error = hammer2_strategy_write(ap);
2223                 ++hammer2_iod_file_write;
2224                 break;
2225         default:
2226                 bp->b_error = error = EINVAL;
2227                 bp->b_flags |= B_ERROR;
2228                 biodone(biop);
2229                 break;
2230         }
2231
2232         return (error);
2233 }
2234
2235 static
2236 int
2237 hammer2_strategy_read(struct vop_strategy_args *ap)
2238 {
2239         struct buf *bp;
2240         struct bio *bio;
2241         struct bio *nbio;
2242         hammer2_mount_t *hmp;
2243         hammer2_inode_t *ip;
2244         hammer2_chain_t *parent;
2245         hammer2_chain_t *chain;
2246         hammer2_key_t lbase;
2247
2248         bio = ap->a_bio;
2249         bp = bio->bio_buf;
2250         ip = VTOI(ap->a_vp);
2251         hmp = ip->hmp;
2252         nbio = push_bio(bio);
2253
2254         lbase = bio->bio_offset;
2255         chain = NULL;
2256         KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);
2257
2258         /*
2259          * We must characterize the logical->physical translation if it
2260          * has not already been cached.
2261          *
2262          * Physical data references < LBUFSIZE are never cached.  This
2263          * includes both small-block allocations and inode-embedded data.
2264          */
2265         if (nbio->bio_offset == NOOFFSET) {
2266                 hammer2_inode_lock_sh(ip);
2267
2268                 parent = hammer2_chain_lookup_init(ip->chain,
2269                                                    HAMMER2_LOOKUP_SHARED);
2270
2271                 chain = hammer2_chain_lookup(&parent, lbase, lbase,
2272                                              HAMMER2_LOOKUP_NODATA |
2273                                              HAMMER2_LOOKUP_SHARED);
2274                 if (chain == NULL) {
2275                         /*
2276                          * Data is zero-fill
2277                          */
2278                         nbio->bio_offset = ZFOFFSET;
2279                 } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2280                         /*
2281                          * Data is embedded in the inode (do nothing)
2282                          */
2283                         KKASSERT(chain == parent);
2284                         hammer2_chain_unlock(chain);
2285                 } else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2286                         /*
2287                          * Data is on-media
2288                          */
2289                         KKASSERT(bp->b_bcount == chain->bytes);
2290                         nbio->bio_offset = chain->bref.data_off &
2291                                            HAMMER2_OFF_MASK;
2292                         hammer2_chain_unlock(chain);
2293                         KKASSERT(nbio->bio_offset != 0);
2294                 } else {
2295                         panic("hammer2_strategy_read: unknown bref type");
2296                 }
2297                 hammer2_chain_lookup_done(parent);
2298                 hammer2_inode_unlock_sh(ip);
2299         }
2300
2301         if (hammer2_debug & 0x0020) {
2302                 kprintf("read %016jx %016jx\n",
2303                         bio->bio_offset, nbio->bio_offset);
2304         }
2305
2306         if (nbio->bio_offset == ZFOFFSET) {
2307                 /*
2308                  * Data is zero-fill
2309                  */
2310                 bp->b_resid = 0;
2311                 bp->b_error = 0;
2312                 bzero(bp->b_data, bp->b_bcount);
2313                 biodone(nbio);
2314         } else if (nbio->bio_offset != NOOFFSET) {
2315                 /*
2316                  * Forward direct IO to the device
2317                  */
2318                 vn_strategy(hmp->devvp, nbio);
2319         } else {
2320                 /*
2321                  * Data is embedded in inode.
2322                  */
2323                 bcopy(chain->data->ipdata.u.data, bp->b_data,
2324                       HAMMER2_EMBEDDED_BYTES);
2325                 bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
2326                       bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
2327                 bp->b_resid = 0;
2328                 bp->b_error = 0;
2329                 biodone(nbio);
2330         }
2331         return (0);
2332 }
2333
2334 static
2335 int
2336 hammer2_strategy_write(struct vop_strategy_args *ap)
2337 {
2338         struct buf *bp;
2339         struct bio *bio;
2340         struct bio *nbio;
2341         hammer2_chain_t *chain;
2342         hammer2_mount_t *hmp;
2343         hammer2_inode_t *ip;
2344
2345         bio = ap->a_bio;
2346         bp = bio->bio_buf;
2347         ip = VTOI(ap->a_vp);
2348         hmp = ip->hmp;
2349         nbio = push_bio(bio);
2350
2351         KKASSERT((bio->bio_offset & HAMMER2_PBUFMASK64) == 0);
2352         KKASSERT(nbio->bio_offset != 0 && nbio->bio_offset != ZFOFFSET);
2353
2354         if (nbio->bio_offset == NOOFFSET) {
2355                 /*
2356                  * The data is embedded in the inode.  Note that strategy
2357                  * calls for embedded data are synchronous in order to
2358                  * ensure that ip->chain is stable.
2359                  */
2360                 KKASSERT(bio->bio_offset == 0);
2361                 KKASSERT(ip->chain && ip->chain->data);
2362                 chain = ip->chain;
2363                 bcopy(bp->b_data, chain->data->ipdata.u.data,
2364                       HAMMER2_EMBEDDED_BYTES);
2365                 bp->b_resid = 0;
2366                 bp->b_error = 0;
2367                 biodone(nbio);
2368
2369                 /*
2370                  * This special flag does not follow the normal MODIFY rules
2371                  * because we might deadlock on ip.  Instead we depend on
2372                  * VOP_FSYNC() to detect the case.
2373                  */
2374                 atomic_set_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
2375         } else {
2376                 /*
2377                  * Forward direct IO to the device
2378                  */
2379                 vn_strategy(hmp->devvp, nbio);
2380         }
2381         return (0);
2382 }
2383
2384 /*
2385  * hammer2_vop_ioctl { vp, command, data, fflag, cred }
2386  */
2387 static
2388 int
2389 hammer2_vop_ioctl(struct vop_ioctl_args *ap)
2390 {
2391         hammer2_mount_t *hmp;
2392         hammer2_inode_t *ip;
2393         int error;
2394
2395         ip = VTOI(ap->a_vp);
2396         hmp = ip->hmp;
2397
2398         error = hammer2_ioctl(ip, ap->a_command, (void *)ap->a_data,
2399                               ap->a_fflag, ap->a_cred);
2400         return (error);
2401 }
2402
2403 static
2404 int 
2405 hammer2_vop_mountctl(struct vop_mountctl_args *ap)
2406 {
2407         struct mount *mp;
2408         hammer2_pfsmount_t *pmp;
2409         int rc;
2410
2411         switch (ap->a_op) {
2412         case (MOUNTCTL_SET_EXPORT):
2413                 mp = ap->a_head.a_ops->head.vv_mount;
2414                 pmp = MPTOPMP(mp);
2415
2416                 if (ap->a_ctllen != sizeof(struct export_args))
2417                         rc = (EINVAL);
2418                 else
2419                         rc = vfs_export(mp, &pmp->export,
2420                                         (const struct export_args *)ap->a_ctl);
2421                 break;
2422         default:
2423                 rc = vop_stdmountctl(ap);
2424                 break;
2425         }
2426         return (rc);
2427 }
2428
2429 struct vop_ops hammer2_vnode_vops = {
2430         .vop_default    = vop_defaultop,
2431         .vop_fsync      = hammer2_vop_fsync,
2432         .vop_getpages   = vop_stdgetpages,
2433         .vop_putpages   = vop_stdputpages,
2434         .vop_access     = hammer2_vop_access,
2435         .vop_advlock    = hammer2_vop_advlock,
2436         .vop_close      = hammer2_vop_close,
2437         .vop_nlink      = hammer2_vop_nlink,
2438         .vop_ncreate    = hammer2_vop_ncreate,
2439         .vop_nsymlink   = hammer2_vop_nsymlink,
2440         .vop_nremove    = hammer2_vop_nremove,
2441         .vop_nrmdir     = hammer2_vop_nrmdir,
2442         .vop_nrename    = hammer2_vop_nrename,
2443         .vop_getattr    = hammer2_vop_getattr,
2444         .vop_setattr    = hammer2_vop_setattr,
2445         .vop_readdir    = hammer2_vop_readdir,
2446         .vop_readlink   = hammer2_vop_readlink,
2447         .vop_getpages   = vop_stdgetpages,
2448         .vop_putpages   = vop_stdputpages,
2449         .vop_read       = hammer2_vop_read,
2450         .vop_write      = hammer2_vop_write,
2451         .vop_open       = hammer2_vop_open,
2452         .vop_inactive   = hammer2_vop_inactive,
2453         .vop_reclaim    = hammer2_vop_reclaim,
2454         .vop_nresolve   = hammer2_vop_nresolve,
2455         .vop_nlookupdotdot = hammer2_vop_nlookupdotdot,
2456         .vop_nmkdir     = hammer2_vop_nmkdir,
2457         .vop_ioctl      = hammer2_vop_ioctl,
2458         .vop_mountctl   = hammer2_vop_mountctl,
2459         .vop_bmap       = hammer2_vop_bmap,
2460         .vop_strategy   = hammer2_vop_strategy,
2461 };
2462
2463 struct vop_ops hammer2_spec_vops = {
2464
2465 };
2466
2467 struct vop_ops hammer2_fifo_vops = {
2468
2469 };