hammer2 - flush sequencing part 1
[dragonfly.git] / sys / vfs / hammer2 / hammer2_vnops.c
1 /*
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * Kernel Filesystem interface
37  *
38  * NOTE! local ipdata pointers must be reloaded on any modifying operation
39  *       to the inode as its underlying chain may have changed.
40  */
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/fcntl.h>
46 #include <sys/buf.h>
47 #include <sys/proc.h>
48 #include <sys/namei.h>
49 #include <sys/mount.h>
50 #include <sys/vnode.h>
51 #include <sys/mountctl.h>
52 #include <sys/dirent.h>
53 #include <sys/uio.h>
54
55 #include "hammer2.h"
56
57 #define ZFOFFSET        (-2LL)
58
59 static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
60                                 int seqcount);
61 static int hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
62                                 struct uio *uio, int ioflag, int seqcount);
63 static hammer2_off_t hammer2_assign_physical(hammer2_trans_t *trans,
64                                 hammer2_inode_t *ip,
65                                 hammer2_key_t lbase, int lblksize,
66                                 int *errorp);
67 static void hammer2_extend_file(hammer2_trans_t *trans,
68                                 hammer2_inode_t *ip, hammer2_key_t nsize);
69 static void hammer2_truncate_file(hammer2_trans_t *trans,
70                                 hammer2_inode_t *ip, hammer2_key_t nsize);
71
72 static __inline
73 void
74 hammer2_knote(struct vnode *vp, int flags)
75 {
76         if (flags)
77                 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
78 }
79
/*
 * Last reference to a vnode is going away but it is still cached.
 *
 * Returns 0.  If the underlying chain has been deleted the vnode is
 * recycled immediately so the inode can be torn down via reclaim.
 */
static
int
hammer2_vop_inactive(struct vop_inactive_args *ap)
{
        hammer2_inode_t *ip;
        struct vnode *vp;
#if 0
        hammer2_trans_t trans;
        struct hammer2_mount *hmp;
#endif

        vp = ap->a_vp;
        ip = VTOI(vp);

        /*
         * Degenerate case: the vnode has already been disassociated from
         * its inode, nothing to synchronize, just recycle it.
         */
        if (ip == NULL) {
                vrecycle(vp);
                return (0);
        }

        /*
         * Detect updates to the embedded data which may be synchronized by
         * the strategy code.  Simply mark the inode modified so it gets
         * picked up by our normal flush.
         */
        hammer2_inode_lock_ex(ip);
        KKASSERT(ip->chain);
#if 0
        /* XXX lock order reversal on inode/trans */
        if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
                atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
                atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
                hammer2_trans_init(ip->hmp, &trans, 0);
                hammer2_chain_modify(&trans, ip->chain, 0);
                hammer2_trans_done(&trans);
        }
#endif

        /*
         * Check for deleted inodes and recycle immediately.  Recycling
         * drops the last vnode reference and leads into vop_reclaim,
         * which handles the actual destruction.
         */
        if (ip->chain->flags & HAMMER2_CHAIN_DELETED) {
                hammer2_inode_unlock_ex(ip);
                vrecycle(vp);
        } else {
                hammer2_inode_unlock_ex(ip);
        }
        return (0);
}
134
/*
 * Reclaim a vnode so that it can be reused; after the inode is
 * disassociated, the filesystem must manage it alone.
 *
 * The vnode and inode are disconnected from each other here.  If the
 * chain backing the inode was deleted, DESTROYED|SUBMODIFIED is set so
 * the flusher can shortcut (ignore) any still-unflushed modified blocks.
 */
static
int
hammer2_vop_reclaim(struct vop_reclaim_args *ap)
{
        hammer2_chain_t *chain;
        hammer2_inode_t *ip;
        hammer2_mount_t *hmp;
#if 0
        hammer2_trans_t trans;
#endif
        struct vnode *vp;

        vp = ap->a_vp;
        ip = VTOI(vp);
        if (ip == NULL)
                return(0);
        hmp = ip->hmp;

        /*
         * Set SUBMODIFIED so we can detect and propagate the DESTROYED
         * bit in the flush code.
         *
         * ip->chain might be stale, correct it before checking as older
         * versions of the chain are likely marked deleted even if the
         * file hasn't been.  XXX ip->chain should never be stale on
         * reclaim.
         */
        hammer2_inode_lock_ex(ip);
        chain = ip->chain;
        if (chain->duplink)
                kprintf("RECLAIM DUPLINKED IP: %p %p\n", ip, ip->chain);
#if 0
        while (chain->duplink)
                chain = chain->duplink;
        if (ip->chain != chain) {
                hammer2_inode_repoint(ip, ip->pip, chain);
                chain = ip->chain;
        }
#endif

        /*
         * The final close of a deleted file or directory marks it for
         * destruction.  The DESTROYED flag allows the flusher to shortcut
         * any modified blocks still unflushed (that is, just ignore them).
         *
         * HAMMER2 usually does not try to optimize the freemap by returning
         * deleted blocks to it as it does not usually know how many snapshots
         * might be referencing portions of the file/dir.  XXX TODO.
         *
         * XXX TODO - However, any modified file as-of when a snapshot is made
         *            cannot use this optimization as some of the modifications
         *            may wind up being part of the snapshot.
         */
        vp->v_data = NULL;
        ip->vp = NULL;
        if (chain->flags & HAMMER2_CHAIN_DELETED) {
                KKASSERT(chain->flags & HAMMER2_CHAIN_DELETED);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROYED |
                                              HAMMER2_CHAIN_SUBMODIFIED);
        }
#if 0
        /*
         * XXX chains will be flushed on sync, no need to do it here.
         */
        if (chain->flags & (HAMMER2_CHAIN_MODIFIED |
                            HAMMER2_CHAIN_DELETED |
                            HAMMER2_CHAIN_SUBMODIFIED)) {
                hammer2_trans_init(ip->hmp, &trans, HAMMER2_TRANS_ISFLUSH);
                hammer2_chain_flush(&trans, chain);
                hammer2_trans_done(&trans);
        }
#endif
        /*
         * If other references remain, just unlock; otherwise put the
         * inode which both unlocks and disconnects it.
         */
        if (ip->refs > 2)                           /* (our lock + vp ref) */
                hammer2_inode_unlock_ex(ip);        /* unlock */
        else
                hammer2_inode_put(ip);              /* unlock & disconnect */
        /* chain no longer referenced */
        /* chain = NULL; not needed */
        hammer2_inode_drop(ip);                     /* vp ref */

        /*
         * XXX handle background sync when ip dirty, kernel will no longer
         * notify us regarding this inode because there is no longer a
         * vnode attached to it.
         */

        return (0);
}
227
/*
 * hammer2_vop_fsync { vp, waitfor, flags }
 *
 * Synchronize the file's dirty buffers and, for an actual fsync()
 * syscall, flush the inode's chain.  All work occurs within a flush
 * transaction.
 */
static
int
hammer2_vop_fsync(struct vop_fsync_args *ap)
{
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;
        hammer2_trans_t trans;
        struct vnode *vp;

        vp = ap->a_vp;
        ip = VTOI(vp);
        hmp = ip->hmp;

        hammer2_trans_init(hmp, &trans, HAMMER2_TRANS_ISFLUSH);
        hammer2_inode_lock_ex(ip);

        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);

        /*
         * Detect updates to the embedded data which may be synchronized by
         * the strategy code.  Simply mark the inode modified so it gets
         * picked up by our normal flush.
         */
        if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
                atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
                atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
                hammer2_chain_modify(&trans, ip->chain, 0);
        }

        /*
         * Calling chain_flush here creates a lot of duplicative
         * COW operations due to non-optimal vnode ordering.
         *
         * Only do it for an actual fsync() syscall.  The other forms
         * which call this function will eventually call chain_flush
         * on the volume root as a catch-all, which is far more optimal.
         */
        if (ap->a_flags & VOP_FSYNC_SYSCALL) {
                atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
                hammer2_chain_flush(&trans, ip->chain);
        }
        hammer2_inode_unlock_ex(ip);
        hammer2_trans_done(&trans);

        return (0);
}
274
275 static
276 int
277 hammer2_vop_access(struct vop_access_args *ap)
278 {
279         hammer2_inode_t *ip = VTOI(ap->a_vp);
280         hammer2_inode_data_t *ipdata;
281         uid_t uid;
282         gid_t gid;
283         int error;
284
285         hammer2_inode_lock_sh(ip);
286         ipdata = &ip->chain->data->ipdata;
287         uid = hammer2_to_unix_xid(&ipdata->uid);
288         gid = hammer2_to_unix_xid(&ipdata->gid);
289         error = vop_helper_access(ap, uid, gid, ipdata->mode, ipdata->uflags);
290         hammer2_inode_unlock_sh(ip);
291
292         return (error);
293 }
294
/*
 * hammer2_vop_getattr { vp, vap, cred }
 *
 * Fill in (vap) from the media inode data, read under a shared inode
 * lock.  Always returns 0.
 */
static
int
hammer2_vop_getattr(struct vop_getattr_args *ap)
{
        hammer2_inode_data_t *ipdata;
        hammer2_pfsmount_t *pmp;
        hammer2_inode_t *ip;
        struct vnode *vp;
        struct vattr *vap;

        vp = ap->a_vp;
        vap = ap->a_vap;

        ip = VTOI(vp);
        pmp = ip->pmp;

        hammer2_inode_lock_sh(ip);
        ipdata = &ip->chain->data->ipdata;

        vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
        vap->va_fileid = ipdata->inum;
        vap->va_mode = ipdata->mode;
        vap->va_nlink = ipdata->nlinks;
        vap->va_uid = hammer2_to_unix_xid(&ipdata->uid);
        vap->va_gid = hammer2_to_unix_xid(&ipdata->gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ipdata->size;
        vap->va_blocksize = HAMMER2_PBUFSIZE;
        vap->va_flags = ipdata->uflags;
        hammer2_time_to_timespec(ipdata->ctime, &vap->va_ctime);
        hammer2_time_to_timespec(ipdata->mtime, &vap->va_mtime);
        /*
         * atime is not supported (see the disabled atime code in
         * hammer2_vop_setattr), so report mtime in its place rather
         * than garbage.
         */
        hammer2_time_to_timespec(ipdata->mtime, &vap->va_atime);
        vap->va_gen = 1;
        vap->va_bytes = vap->va_size;   /* XXX */
        vap->va_type = hammer2_get_vtype(ip->chain);
        vap->va_filerev = 0;
        vap->va_uid_uuid = ipdata->uid;
        vap->va_gid_uuid = ipdata->gid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        hammer2_inode_unlock_sh(ip);

        return (0);
}
341
/*
 * hammer2_vop_setattr { vp, vap, cred }
 *
 * Apply the attribute changes requested in (vap) within a transaction.
 * Each modification path calls hammer2_chain_modify() and then RELOADs
 * the local ipdata pointer, because modifying the chain may COW it and
 * relocate the underlying inode data (see the NOTE at the top of this
 * file).
 *
 * Returns 0 on success, EROFS on a read-only mount, EPERM when blocked
 * by IMMUTABLE/APPEND, EINVAL for a resize on a non-regular file, or an
 * error from one of the vop helpers.
 */
static
int
hammer2_vop_setattr(struct vop_setattr_args *ap)
{
        hammer2_inode_data_t *ipdata;
        hammer2_inode_t *ip;
        hammer2_mount_t *hmp;
        hammer2_trans_t trans;
        struct vnode *vp;
        struct vattr *vap;
        int error;
        int kflags = 0;         /* NOTE(review): accumulated but never
                                   passed to hammer2_knote() here --
                                   looks like a TODO; confirm */
        int domtime = 0;        /* NOTE(review): set on resize but never
                                   consumed -- mtime update on resize
                                   appears unimplemented; confirm */
        uint64_t ctime;

        vp = ap->a_vp;
        vap = ap->a_vap;
        hammer2_update_time(&ctime);

        ip = VTOI(vp);
        hmp = ip->hmp;

        if (hmp->ronly)
                return(EROFS);

        hammer2_trans_init(hmp, &trans, 0);
        hammer2_inode_lock_ex(ip);
        ipdata = &ip->chain->data->ipdata;
        error = 0;

        /*
         * A chflags-style request is handled exclusively; no other
         * attribute changes are processed in the same call.
         */
        if (vap->va_flags != VNOVAL) {
                u_int32_t flags;

                flags = ipdata->uflags;
                error = vop_helper_setattr_flags(&flags, vap->va_flags,
                                         hammer2_to_unix_xid(&ipdata->uid),
                                         ap->a_cred);
                if (error == 0) {
                        if (ipdata->uflags != flags) {
                                hammer2_chain_modify(&trans, ip->chain, 0);
                                ipdata = &ip->chain->data->ipdata; /* RELOAD */
                                ipdata->uflags = flags;
                                ipdata->ctime = ctime;
                                kflags |= NOTE_ATTRIB;
                        }
                        if (ipdata->uflags & (IMMUTABLE | APPEND)) {
                                error = 0;
                                goto done;
                        }
                }
                goto done;
        }

        /*
         * All other attribute changes are refused on an immutable or
         * append-only file.
         */
        if (ipdata->uflags & (IMMUTABLE | APPEND)) {
                error = EPERM;
                goto done;
        }

        /*
         * chown/chgrp
         */
        if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
                mode_t cur_mode = ipdata->mode;
                uid_t cur_uid = hammer2_to_unix_xid(&ipdata->uid);
                gid_t cur_gid = hammer2_to_unix_xid(&ipdata->gid);
                uuid_t uuid_uid;
                uuid_t uuid_gid;

                error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
                                         ap->a_cred,
                                         &cur_uid, &cur_gid, &cur_mode);
                if (error == 0) {
                        hammer2_guid_to_uuid(&uuid_uid, cur_uid);
                        hammer2_guid_to_uuid(&uuid_gid, cur_gid);
                        if (bcmp(&uuid_uid, &ipdata->uid, sizeof(uuid_uid)) ||
                            bcmp(&uuid_gid, &ipdata->gid, sizeof(uuid_gid)) ||
                            ipdata->mode != cur_mode
                        ) {
                                hammer2_chain_modify(&trans, ip->chain, 0);
                                ipdata = &ip->chain->data->ipdata; /* RELOAD */
                                ipdata->uid = uuid_uid;
                                ipdata->gid = uuid_gid;
                                ipdata->mode = cur_mode;
                                ipdata->ctime = ctime;
                        }
                        kflags |= NOTE_ATTRIB;
                }
        }

        /*
         * Resize the file
         */
        if (vap->va_size != VNOVAL && ipdata->size != vap->va_size) {
                switch(vp->v_type) {
                case VREG:
                        if (vap->va_size == ipdata->size)
                                break;
                        if (vap->va_size < ipdata->size) {
                                hammer2_truncate_file(&trans, ip, vap->va_size);
                        } else {
                                hammer2_extend_file(&trans, ip, vap->va_size);
                        }
                        ipdata = &ip->chain->data->ipdata; /* RELOAD */
                        domtime = 1;
                        break;
                default:
                        error = EINVAL;
                        goto done;
                }
        }
#if 0
        /* atime not supported */
        if (vap->va_atime.tv_sec != VNOVAL) {
                hammer2_chain_modify(&trans, ip->chain, 0);
                ipdata = &ip->chain->data->ipdata; /* RELOAD */
                ipdata->atime = hammer2_timespec_to_time(&vap->va_atime);
                kflags |= NOTE_ATTRIB;
        }
#endif
        if (vap->va_mtime.tv_sec != VNOVAL) {
                hammer2_chain_modify(&trans, ip->chain, 0);
                ipdata = &ip->chain->data->ipdata; /* RELOAD */
                ipdata->mtime = hammer2_timespec_to_time(&vap->va_mtime);
                kflags |= NOTE_ATTRIB;
        }

        /*
         * chmod
         */
        if (vap->va_mode != (mode_t)VNOVAL) {
                mode_t cur_mode = ipdata->mode;
                uid_t cur_uid = hammer2_to_unix_xid(&ipdata->uid);
                gid_t cur_gid = hammer2_to_unix_xid(&ipdata->gid);

                error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
                                         cur_uid, cur_gid, &cur_mode);
                if (error == 0 && ipdata->mode != cur_mode) {
                        hammer2_chain_modify(&trans, ip->chain, 0);
                        ipdata = &ip->chain->data->ipdata; /* RELOAD */
                        ipdata->mode = cur_mode;
                        ipdata->ctime = ctime;
                        kflags |= NOTE_ATTRIB;
                }
        }
done:
        hammer2_inode_unlock_ex(ip);
        hammer2_trans_done(&trans);
        return (error);
}
482
/*
 * hammer2_vop_readdir { vp, uio, cred, eofflag, ncookies, cookies }
 *
 * Emit directory entries for the directory (vp) starting at the hash
 * offset encoded in uio_offset.  Offsets 0 and 1 are synthesized "."
 * and ".." entries; real entries are scanned from the inode's chain
 * using shared lookups.
 */
static
int
hammer2_vop_readdir(struct vop_readdir_args *ap)
{
        hammer2_inode_data_t *ipdata;
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;
        hammer2_inode_t *xip;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        hammer2_tid_t inum;
        hammer2_key_t lkey;
        struct uio *uio;
        off_t *cookies;
        off_t saveoff;
        int cookie_index;
        int ncookies;
        int error;
        int dtype;
        int r;

        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        uio = ap->a_uio;
        saveoff = uio->uio_offset;

        /*
         * Setup directory entry cookies if requested.  The count is
         * sized from the uio residual and capped.
         */
        if (ap->a_ncookies) {
                ncookies = uio->uio_resid / 16 + 1;
                if (ncookies > 1024)
                        ncookies = 1024;
                cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
        } else {
                ncookies = -1;
                cookies = NULL;
        }
        cookie_index = 0;

        hammer2_inode_lock_sh(ip);
        ipdata = &ip->chain->data->ipdata;

        /*
         * Handle artificial entries.  To ensure that only positive 64 bit
         * quantities are returned to userland we always strip off bit 63.
         * The hash code is designed such that codes 0x0000-0x7FFF are not
         * used, allowing us to use these codes for artificial entries.
         *
         * Entry 0 is used for '.' and entry 1 is used for '..'.  Do not
         * allow '..' to cross the mount point into (e.g.) the super-root.
         */
        error = 0;
        chain = (void *)(intptr_t)-1;   /* non-NULL for early goto done case */

        if (saveoff == 0) {
                inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 1, ".");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        if (saveoff == 1) {
                /*
                 * Be careful with lockorder when accessing ".."
                 *
                 * (ip is the current dir. xip is the parent dir).
                 *
                 * The parent lock is acquired with ip's shared lock
                 * dropped and then both are retaken; the loop retries
                 * if ip->pip changed while unlocked.
                 */
                inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
                while (ip->pip != NULL && ip != ip->pmp->iroot) {
                        xip = ip->pip;
                        hammer2_inode_ref(xip);
                        hammer2_inode_unlock_sh(ip);
                        hammer2_inode_lock_sh(xip);
                        hammer2_inode_lock_sh(ip);
                        hammer2_inode_drop(xip);
                        if (xip == ip->pip) {
                                inum = xip->chain->data->ipdata.inum &
                                       HAMMER2_DIRHASH_USERMSK;
                                hammer2_inode_unlock_sh(xip);
                                break;
                        }
                        hammer2_inode_unlock_sh(xip);
                }
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 2, "..");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        lkey = saveoff | HAMMER2_DIRHASH_VISIBLE;

        /*
         * parent is the inode chain, already locked for us.  Don't
         * double lock shared locks as this will screw up upgrades.
         */
        if (error) {
                goto done;
        }
        parent = hammer2_chain_lookup_init(ip->chain, HAMMER2_LOOKUP_SHARED);
        chain = hammer2_chain_lookup(&parent, lkey, lkey,
                                     HAMMER2_LOOKUP_SHARED);
        if (chain == NULL) {
                chain = hammer2_chain_lookup(&parent,
                                             lkey, (hammer2_key_t)-1,
                                             HAMMER2_LOOKUP_SHARED);
        }
        while (chain) {
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        dtype = hammer2_get_dtype(chain);
                        saveoff = chain->bref.key & HAMMER2_DIRHASH_USERMSK;
                        r = vop_write_dirent(&error, uio,
                                             chain->data->ipdata.inum &
                                              HAMMER2_DIRHASH_USERMSK,
                                             dtype,
                                             chain->data->ipdata.name_len,
                                             chain->data->ipdata.filename);
                        if (r)
                                break;
                        if (cookies)
                                cookies[cookie_index] = saveoff;
                        ++cookie_index;
                } else {
                        /* XXX chain error */
                        kprintf("bad chain type readdir %d\n",
                                chain->bref.type);
                }

                /*
                 * Keys may not be returned in order so once we have a
                 * placemarker (chain) the scan must allow the full range
                 * or some entries will be missed.
                 */
                chain = hammer2_chain_next(&parent, chain,
                                           HAMMER2_DIRHASH_VISIBLE,
                                           (hammer2_key_t)-1,
                                           HAMMER2_LOOKUP_SHARED);
                if (chain) {
                        saveoff = (chain->bref.key &
                                   HAMMER2_DIRHASH_USERMSK) + 1;
                } else {
                        saveoff = (hammer2_key_t)-1;
                }
                if (cookie_index == ncookies)
                        break;
        }
        if (chain)
                hammer2_chain_unlock(chain);
        hammer2_chain_lookup_done(parent);
done:
        hammer2_inode_unlock_sh(ip);
        if (ap->a_eofflag)
                *ap->a_eofflag = (chain == NULL);
        uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
        /*
         * On error with nothing emitted the cookie array is released;
         * otherwise ownership of the array transfers to the caller.
         */
        if (error && cookie_index == 0) {
                if (cookies) {
                        kfree(cookies, M_TEMP);
                        *ap->a_ncookies = 0;
                        *ap->a_cookies = NULL;
                }
        } else {
                if (cookies) {
                        *ap->a_ncookies = cookie_index;
                        *ap->a_cookies = cookies;
                }
        }
        return (error);
}
662
663 /*
664  * hammer2_vop_readlink { vp, uio, cred }
665  */
666 static
667 int
668 hammer2_vop_readlink(struct vop_readlink_args *ap)
669 {
670         struct vnode *vp;
671         hammer2_mount_t *hmp;
672         hammer2_inode_t *ip;
673         int error;
674
675         vp = ap->a_vp;
676         if (vp->v_type != VLNK)
677                 return (EINVAL);
678         ip = VTOI(vp);
679         hmp = ip->hmp;
680
681         error = hammer2_read_file(ip, ap->a_uio, 0);
682         return (error);
683 }
684
685 static
686 int
687 hammer2_vop_read(struct vop_read_args *ap)
688 {
689         struct vnode *vp;
690         hammer2_mount_t *hmp;
691         hammer2_inode_t *ip;
692         struct uio *uio;
693         int error;
694         int seqcount;
695         int bigread;
696
697         /*
698          * Read operations supported on this vnode?
699          */
700         vp = ap->a_vp;
701         if (vp->v_type != VREG)
702                 return (EINVAL);
703
704         /*
705          * Misc
706          */
707         ip = VTOI(vp);
708         hmp = ip->hmp;
709         uio = ap->a_uio;
710         error = 0;
711
712         seqcount = ap->a_ioflag >> 16;
713         bigread = (uio->uio_resid > 100 * 1024 * 1024);
714
715         error = hammer2_read_file(ip, uio, seqcount);
716         return (error);
717 }
718
719 static
720 int
721 hammer2_vop_write(struct vop_write_args *ap)
722 {
723         hammer2_mount_t *hmp;
724         hammer2_inode_t *ip;
725         hammer2_trans_t trans;
726         thread_t td;
727         struct vnode *vp;
728         struct uio *uio;
729         int error;
730         int seqcount;
731         int bigwrite;
732
733         /*
734          * Read operations supported on this vnode?
735          */
736         vp = ap->a_vp;
737         if (vp->v_type != VREG)
738                 return (EINVAL);
739
740         /*
741          * Misc
742          */
743         ip = VTOI(vp);
744         hmp = ip->hmp;
745         uio = ap->a_uio;
746         error = 0;
747         if (hmp->ronly)
748                 return (EROFS);
749
750         seqcount = ap->a_ioflag >> 16;
751         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
752
753         /*
754          * Check resource limit
755          */
756         if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
757             uio->uio_offset + uio->uio_resid >
758              td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
759                 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
760                 return (EFBIG);
761         }
762
763         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
764
765         /*
766          * ip must be locked if extending the file.
767          * ip must be locked to avoid racing a truncation.
768          *
769          * ip must be marked modified, particularly because the write
770          * might wind up being copied into the embedded data area.
771          */
772         hammer2_trans_init(ip->hmp, &trans, 0);
773         hammer2_inode_lock_ex(ip);
774         error = hammer2_write_file(ip, &trans, uio, ap->a_ioflag, seqcount);
775         hammer2_inode_unlock_ex(ip);
776         hammer2_trans_done(&trans);
777
778         return (error);
779 }
780
781 /*
782  * Perform read operations on a file or symlink given an UNLOCKED
783  * inode and uio.
784  *
785  * The passed ip is not locked.
786  */
787 static
788 int
789 hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
790 {
791         hammer2_off_t size;
792         struct buf *bp;
793         int error;
794
795         error = 0;
796
797         /*
798          * UIO read loop.
799          */
800         hammer2_inode_lock_sh(ip);
801         size = ip->chain->data->ipdata.size;
802
803         while (uio->uio_resid > 0 && uio->uio_offset < size) {
804                 hammer2_key_t lbase;
805                 hammer2_key_t leof;
806                 int lblksize;
807                 int loff;
808                 int n;
809
810                 lblksize = hammer2_calc_logical(ip, uio->uio_offset,
811                                                 &lbase, &leof);
812
813                 error = cluster_read(ip->vp, leof, lbase, lblksize,
814                                      uio->uio_resid, seqcount * BKVASIZE,
815                                      &bp);
816
817                 if (error)
818                         break;
819                 loff = (int)(uio->uio_offset - lbase);
820                 n = lblksize - loff;
821                 if (n > uio->uio_resid)
822                         n = uio->uio_resid;
823                 if (n > size - uio->uio_offset)
824                         n = (int)(size - uio->uio_offset);
825                 bp->b_flags |= B_AGE;
826                 uiomove((char *)bp->b_data + loff, n, uio);
827                 bqrelse(bp);
828         }
829         hammer2_inode_unlock_sh(ip);
830         return (error);
831 }
832
/*
 * Called with a locked (ip) to do the underlying write to a file or
 * to build the symlink target.
 *
 * (trans) is the caller's active transaction.  (ioflag) carries the
 * usual IO_* modifiers (IO_APPEND, IO_SYNC, IO_DIRECT, IO_ASYNC,
 * IO_RECURSE) and (seqcount) is the sequential-access heuristic passed
 * through to cluster_write().
 *
 * Returns 0 on success or an errno from the buffer cache or uiomove().
 * If the file was extended and the write then fails, the file is
 * truncated back to its original EOF before returning.
 *
 * NOTE: Any cached &ip->chain->data->ipdata pointer goes stale whenever
 *	 the inode is unlocked/relocked or its chain is modified; the
 *	 "RELOAD"/"reload" comments below mark the mandatory re-fetches.
 */
static
int
hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
		   struct uio *uio, int ioflag, int seqcount)
{
	hammer2_inode_data_t *ipdata;
	hammer2_key_t old_eof;		/* original EOF, for error backout */
	struct buf *bp;
	int kflags;			/* accumulated kqueue notes */
	int error;
	int modified = 0;		/* any data or size change occurred */

	/*
	 * Setup if append
	 */
	ipdata = &ip->chain->data->ipdata;
	if (ioflag & IO_APPEND)
		uio->uio_offset = ipdata->size;
	kflags = 0;
	error = 0;

	/*
	 * Extend the file if necessary.  If the write fails at some point
	 * we will truncate it back down to cover as much as we were able
	 * to write.
	 *
	 * Doing this now makes it easier to calculate buffer sizes in
	 * the loop.
	 */
	KKASSERT(ipdata->type != HAMMER2_OBJTYPE_HARDLINK);
	old_eof = ipdata->size;
	if (uio->uio_offset + uio->uio_resid > ipdata->size) {
		modified = 1;
		hammer2_extend_file(trans, ip,
				    uio->uio_offset + uio->uio_resid);
		ipdata = &ip->chain->data->ipdata;	/* RELOAD */
		kflags |= NOTE_EXTEND;
	}
	KKASSERT(ipdata->type != HAMMER2_OBJTYPE_HARDLINK);

	/*
	 * UIO write loop, one logical block per iteration.
	 */
	while (uio->uio_resid > 0) {
		hammer2_key_t lbase;
		hammer2_key_t leof;
		int trivial;
		int lblksize;
		int loff;
		int n;

		/*
		 * Don't allow the buffer build to blow out the buffer
		 * cache.
		 */
		if ((ioflag & IO_RECURSE) == 0) {
			/*
			 * XXX should try to leave this unlocked through
			 *	the whole loop
			 */
			hammer2_inode_unlock_ex(ip);
			bwillwrite(HAMMER2_PBUFSIZE);
			hammer2_inode_lock_ex(ip);
			ipdata = &ip->chain->data->ipdata;	/* reload */
		}

		/* XXX bigwrite & signal check test */

		/*
		 * This nominally tells us how much we can cluster and
		 * what the logical buffer size needs to be.  Currently
		 * we don't try to cluster the write and just handle one
		 * block at a time.
		 */
		lblksize = hammer2_calc_logical(ip, uio->uio_offset,
						&lbase, &leof);
		loff = (int)(uio->uio_offset - lbase);

		/*
		 * Calculate bytes to copy this transfer and whether the
		 * copy completely covers the buffer or not.  "trivial"
		 * means any part of the buffer we don't overwrite does
		 * not need to be read in from media first.
		 */
		trivial = 0;
		n = lblksize - loff;
		if (n > uio->uio_resid) {
			n = uio->uio_resid;
			if (uio->uio_offset + n == ipdata->size)
				trivial = 1;
		} else if (loff == 0) {
			trivial = 1;
		}

		/*
		 * Get the buffer
		 */
		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ip->vp, lbase, lblksize, &bp);
			}
		} else if (trivial) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 *
			 * (The strategy code will detect zero-fill physical
			 * blocks for this case).
			 */
			error = bread(ip->vp, lbase, lblksize, &bp);
			if (error == 0)
				bheavy(bp);
		}

		if (error) {
			brelse(bp);
			break;
		}

		/*
		 * We have to assign physical storage to the buffer we intend
		 * to dirty or write now to avoid deadlocks in the strategy
		 * code later.
		 *
		 * This can return NOOFFSET for inode-embedded data.  The
		 * strategy code will take care of it in that case.
		 */
		bp->b_bio2.bio_offset =
			hammer2_assign_physical(trans, ip,
						lbase, lblksize, &error);
		ipdata = &ip->chain->data->ipdata;	/* RELOAD */
		if (error) {
			brelse(bp);
			break;
		}

		/*
		 * Ok, copy the data in.  The inode is unlocked across the
		 * uiomove() because the copy can fault on user memory.
		 */
		hammer2_inode_unlock_ex(ip);
		error = uiomove(bp->b_data + loff, n, uio);
		hammer2_inode_lock_ex(ip);
		ipdata = &ip->chain->data->ipdata;	/* reload */
		kflags |= NOTE_WRITE;
		modified = 1;

		if (error) {
			brelse(bp);
			break;
		}

		/* XXX update ip_data.mtime */

		/*
		 * Once we dirty a buffer any cached offset becomes invalid.
		 *
		 * NOTE: For cluster_write() always use the trailing block
		 *       size, which is HAMMER2_PBUFSIZE.  lblksize is the
		 *       eof-straddling blocksize and is incorrect.
		 */
		bp->b_flags |= B_AGE;
		if ((ioflag & IO_SYNC) ||
		    (lbase == 0 && (ipdata->op_flags &
				    HAMMER2_OPFLAG_DIRECTDATA))) {
			/*
			 * Synchronous I/O requested or writing to the
			 * inode's embedded data (which must be synchronous).
			 */
			bwrite(bp);
		} else if ((ioflag & IO_DIRECT) && loff + n == lblksize) {
			/* full-block direct write: delayed write */
			if (bp->b_bcount == HAMMER2_PBUFSIZE)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		} else if (ioflag & IO_ASYNC) {
			bawrite(bp);
		} else if (hammer2_cluster_enable) {
			if (bp->b_bcount == HAMMER2_PBUFSIZE)
				bp->b_flags |= B_CLUSTEROK;
			cluster_write(bp, leof, HAMMER2_PBUFSIZE, seqcount);
		} else {
			if (bp->b_bcount == HAMMER2_PBUFSIZE)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}

	/*
	 * Cleanup.  If we extended the file EOF but failed to write through
	 * the entire write is a failure and we have to back-up.
	 */
	if (error && ipdata->size != old_eof) {
		hammer2_truncate_file(trans, ip, old_eof);
		ipdata = &ip->chain->data->ipdata;	/* RELOAD */
	} else if (modified) {
		hammer2_chain_modify(trans, ip->chain, 0);
		ipdata = &ip->chain->data->ipdata;	/* RELOAD */
		hammer2_update_time(&ipdata->mtime);
	}
	hammer2_knote(ip->vp, kflags);

	return error;
}
1055
/*
 * Assign physical storage to a logical block.  This function creates the
 * related meta-data chains representing the data blocks and marks them
 * MODIFIED.  We could mark them MOVED instead but ultimately I need to
 * XXX code the flusher to check that the related logical buffer is
 * flushed.
 *
 * Returns the physical media offset (masked of its radix bits) for the
 * block at (lbase), creating a new DATA chain if the lookup finds a hole.
 * *errorp is set to 0 on success or to the chain-create error.
 *
 * NOOFFSET is returned if the data is inode-embedded.  In this case the
 * strategy code will simply bcopy() the data into the inode.
 *
 * The inode's delta_dcount is adjusted.
 */
static
hammer2_off_t
hammer2_assign_physical(hammer2_trans_t *trans, hammer2_inode_t *ip,
			hammer2_key_t lbase, int lblksize, int *errorp)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_off_t pbase;

	/*
	 * Locate the chain associated with lbase, return a locked chain.
	 * However, do not instantiate any data reference (which utilizes a
	 * device buffer) because we will be using direct IO via the
	 * logical buffer cache buffer.
	 */
	hmp = ip->hmp;
	*errorp = 0;
retry:
	hammer2_inode_lock_ex(ip);
	parent = hammer2_chain_lookup_init(ip->chain, 0);
	chain = hammer2_chain_lookup(&parent,
				     lbase, lbase,
				     HAMMER2_LOOKUP_NODATA);

	if (chain == NULL) {
		/*
		 * We found a hole, create a new chain entry.
		 *
		 * NOTE: DATA chains are created without device backing
		 *       store (nor do we want any).
		 */
		*errorp = hammer2_chain_create(trans, &parent, &chain,
					       lbase, HAMMER2_PBUFRADIX,
					       HAMMER2_BREF_TYPE_DATA,
					       lblksize);
		if (chain == NULL) {
			/*
			 * NOTE: panic() does not return, so the goto retry
			 *	 below is dead code unless the panic is
			 *	 later downgraded to a recoverable path.
			 */
			hammer2_inode_unlock_ex(ip);
			hammer2_chain_lookup_done(parent);
			panic("hammer2_chain_create: par=%p error=%d\n",
				parent, *errorp);
			goto retry;
		}

		pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
		/*ip->delta_dcount += lblksize;*/
	} else {
		switch (chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * The data is embedded in the inode.  The
			 * caller is responsible for marking the inode
			 * modified and copying the data to the embedded
			 * area.
			 */
			pbase = NOOFFSET;
			break;
		case HAMMER2_BREF_TYPE_DATA:
			if (chain->bytes != lblksize) {
				panic("hammer2_assign_physical: "
				      "size mismatch %d/%d\n",
				      lblksize, chain->bytes);
			}
			/* mark modified but don't instantiate data */
			hammer2_chain_modify(trans, chain,
					     HAMMER2_MODIFY_OPTDATA);
			pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
			break;
		default:
			panic("hammer2_assign_physical: bad type");
			/* NOT REACHED */
			pbase = NOOFFSET;
			break;
		}
	}
	if (chain)
		hammer2_chain_unlock(chain);
	hammer2_chain_lookup_done(parent);

	hammer2_inode_unlock_ex(ip);

	return (pbase);
}
1150
/*
 * Truncate the size of a file.
 *
 * This routine adjusts ipdata->size smaller, destroying any related
 * data beyond the new EOF and potentially resizing the block straddling
 * the EOF.
 *
 * The sequence is:  (1) truncate the buffer cache past nsize (leaving
 * the straddling buffer alone), (2) read and resize/reassign the
 * straddling logical buffer and its chain, (3) clean up straddling VM
 * pages, (4) delete all physical data chains past the new EOF.
 *
 * The inode must be locked.
 */
static
void
hammer2_truncate_file(hammer2_trans_t *trans,
		      hammer2_inode_t *ip, hammer2_key_t nsize)
{
	hammer2_inode_data_t *ipdata;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t lbase;
	hammer2_key_t leof;
	struct buf *bp;		/* straddling logical buffer, if any */
	int loff;		/* offset of nsize within its block */
	int error;
	int oblksize;		/* straddling block's size before truncation */
	int nblksize;		/* straddling block's size after truncation */

	hammer2_chain_modify(trans, ip->chain, 0);
	bp = NULL;
	ipdata = &ip->chain->data->ipdata;
	error = 0;

	/*
	 * Destroy any logical buffer cache buffers beyond the file EOF.
	 *
	 * We call nvtruncbuf() w/ trivial == 1 to prevent it from messing
	 * around with the buffer straddling EOF, because we need to assign
	 * a new physical offset to it.
	 */
	if (ip->vp) {
		nvtruncbuf(ip->vp, nsize,
			   HAMMER2_PBUFSIZE, (int)nsize & HAMMER2_PBUFMASK,
			   1);
	}

	/*
	 * Setup for lookup/search
	 */
	parent = hammer2_chain_lookup_init(ip->chain, 0);

	/*
	 * Handle the case where a chain/logical-buffer straddles the new
	 * EOF.  We told nvtruncbuf() above not to mess with the logical
	 * buffer straddling the EOF because we need to reassign its storage
	 * and can't let the strategy code do it for us.
	 */
	loff = (int)nsize & HAMMER2_PBUFMASK;
	if (loff && ip->vp) {
		oblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
		error = bread(ip->vp, lbase, oblksize, &bp);
		KKASSERT(error == 0);
	}
	/*
	 * Commit the new size, then recompute the straddling block's
	 * (possibly smaller) blocksize under the new EOF.
	 */
	ipdata->size = nsize;
	nblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);

	/*
	 * Fixup the chain element.  If we have a logical buffer in-hand
	 * we don't want to create a conflicting device buffer.
	 */
	if (loff && bp) {
		chain = hammer2_chain_lookup(&parent, lbase, lbase,
					     HAMMER2_LOOKUP_NODATA);
		if (chain) {
			switch(chain->bref.type) {
			case HAMMER2_BREF_TYPE_DATA:
				/* shrink chain, buffer, and zero the tail */
				hammer2_chain_resize(trans, ip, bp,
					     parent, &chain,
					     hammer2_allocsize(nblksize),
					     HAMMER2_MODIFY_OPTDATA);
				allocbuf(bp, nblksize);
				bzero(bp->b_data + loff, nblksize - loff);
				bp->b_bio2.bio_offset = chain->bref.data_off &
							HAMMER2_OFF_MASK;
				break;
			case HAMMER2_BREF_TYPE_INODE:
				/* data embedded in inode, no media offset */
				allocbuf(bp, nblksize);
				bzero(bp->b_data + loff, nblksize - loff);
				bp->b_bio2.bio_offset = NOOFFSET;
				break;
			default:
				panic("hammer2_truncate_file: bad type");
				break;
			}
			hammer2_chain_unlock(chain);
			if (bp->b_bcount == HAMMER2_PBUFSIZE)
				bp->b_flags |= B_CLUSTEROK;
			if (lbase == 0 &&
			    (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA)) {
				/*
				 * Must be synchronous if writing to the
				 * inode's embedded data area.
				 */
				bwrite(bp);
			} else {
				/*
				 * Else a delayed-write is fine.
				 */
				bdwrite(bp);
			}
		} else {
			/*
			 * Destroy clean buffer w/ wrong buffer size.  Retain
			 * backing store.
			 */
			bp->b_flags |= B_RELBUF;
			KKASSERT(bp->b_bio2.bio_offset == NOOFFSET);
			KKASSERT((bp->b_flags & B_DIRTY) == 0);
			bqrelse(bp);
		}
	} else if (loff) {
		/*
		 * WARNING: This utilizes a device buffer for the data.
		 *
		 * This case should not occur because file truncations without
		 * a vnode (and hence no logical buffer cache) should only
		 * always truncate to 0-length.
		 */
		panic("hammer2_truncate_file: non-zero truncation, no-vnode");
#if 0
		chain = hammer2_chain_lookup(&parent, lbase, lbase, 0);
		if (chain) {
			switch(chain->bref.type) {
			case HAMMER2_BREF_TYPE_DATA:
				chain = hammer2_chain_resize(trans, ip, bp,
					     parent, chain,
					     hammer2_allocsize(nblksize),
					     0);
				hammer2_chain_modify(hmp, chain, 0);
				bzero(chain->data->buf + loff, nblksize - loff);
				break;
			case HAMMER2_BREF_TYPE_INODE:
				if (loff < HAMMER2_EMBEDDED_BYTES) {
					hammer2_chain_modify(hmp, chain, 0);
					bzero(chain->data->ipdata.u.data + loff,
					      HAMMER2_EMBEDDED_BYTES - loff);
				}
				break;
			}
			hammer2_chain_unlock(chain);
		}
#endif
	}

	/*
	 * Clean up any fragmentory VM pages now that we have properly
	 * resized the straddling buffer.  These pages are no longer
	 * part of the buffer.
	 */
	if (ip->vp) {
		nvtruncbuf(ip->vp, nsize,
			   nblksize, (int)nsize & (nblksize - 1),
			   1);
	}

	/*
	 * Destroy any physical blocks after the new EOF point.
	 */
	lbase = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
	chain = hammer2_chain_lookup(&parent,
				     lbase, (hammer2_key_t)-1,
				     HAMMER2_LOOKUP_NODATA);
	while (chain) {
		/*
		 * Degenerate embedded data case, nothing to loop on.
		 */
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			hammer2_chain_unlock(chain);
			break;
		}

		/*
		 * Delete physical data blocks past the file EOF.
		 */
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			/*ip->delta_dcount -= chain->bytes;*/
			hammer2_chain_delete(trans, parent, chain);
		}
		/* XXX check parent if empty indirect block & delete */
		chain = hammer2_chain_next(&parent, chain,
					   lbase, (hammer2_key_t)-1,
					   HAMMER2_LOOKUP_NODATA);
	}
	hammer2_chain_lookup_done(parent);
}
1343
/*
 * Extend the size of a file.  The inode must be locked.
 *
 * We may have to resize the block straddling the old EOF.  If the file
 * is still within the inode's embedded (direct-data) area nothing but
 * the size and buffer cache need adjusting; otherwise direct-data mode
 * is disabled and the straddling chain/buffer is resized or created.
 */
static
void
hammer2_extend_file(hammer2_trans_t *trans,
		    hammer2_inode_t *ip, hammer2_key_t nsize)
{
	hammer2_inode_data_t *ipdata;
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	struct buf *bp;
	hammer2_key_t osize;	/* original file size */
	hammer2_key_t obase;	/* base of block straddling old EOF */
	hammer2_key_t nbase;	/* base of that block after extension */
	hammer2_key_t leof;
	int oblksize;		/* straddling block's size before extension */
	int nblksize;		/* straddling block's size after extension */
	int nradix;
	int error;

	KKASSERT(ip->vp);
	hmp = ip->hmp;

	hammer2_chain_modify(trans, ip->chain, 0);
	ipdata = &ip->chain->data->ipdata;

	/*
	 * Nothing to do if the direct-data case is still intact
	 */
	if ((ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
	    nsize <= HAMMER2_EMBEDDED_BYTES) {
		ipdata->size = nsize;
		/*
		 * NOTE(review): ipdata->size was just set to nsize, so
		 * both size arguments passed to nvextendbuf() here are
		 * the new size; if nvextendbuf() expects (old, new) the
		 * original size has been lost -- confirm intended.
		 */
		nvextendbuf(ip->vp,
			    ipdata->size, nsize,
			    0, HAMMER2_EMBEDDED_BYTES,
			    0, (int)nsize,
			    1);
		/* ipdata = &ip->chain->data->ipdata; RELOAD */
		return;
	}

	/*
	 * Calculate the blocksize at the original EOF and resize the block
	 * if necessary.  Adjust the file size in the inode.
	 */
	osize = ipdata->size;
	oblksize = hammer2_calc_logical(ip, osize, &obase, &leof);
	ipdata->size = nsize;
	/*
	 * Recompute the straddling block's size at offset osize now that
	 * ipdata->size has been updated; the block straddling the old EOF
	 * may grow to a full block under the new EOF.
	 * NOTE(review): intentionally osize (not nsize) -- confirm.
	 */
	nblksize = hammer2_calc_logical(ip, osize, &nbase, &leof);

	/*
	 * Do all required vnode operations, but do not mess with the
	 * buffer straddling the orignal EOF.
	 *
	 * NOTE(review): as in the direct-data case above, ipdata->size
	 * already equals nsize here -- confirm nvextendbuf() arguments.
	 */
	nvextendbuf(ip->vp,
		    ipdata->size, nsize,
		    0, nblksize,
		    0, (int)nsize & HAMMER2_PBUFMASK,
		    1);
	ipdata = &ip->chain->data->ipdata;

	/*
	 * Early return if we have no more work to do.
	 */
	if (obase == nbase && oblksize == nblksize &&
	    (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) == 0) {
		return;
	}

	/*
	 * We have work to do, including possibly resizing the buffer
	 * at the previous EOF point and turning off DIRECTDATA mode.
	 */
	bp = NULL;
	if (((int)osize & HAMMER2_PBUFMASK)) {
		error = bread(ip->vp, obase, oblksize, &bp);
		KKASSERT(error == 0);
	}

	/*
	 * Disable direct-data mode by loading up a buffer cache buffer
	 * with the data, then converting the inode data area into the
	 * inode indirect block array area.
	 */
	if (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
		ipdata->op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
		bzero(&ipdata->u.blockset, sizeof(ipdata->u.blockset));
	}

	/*
	 * Resize the chain element at the old EOF.
	 */
	if (((int)osize & HAMMER2_PBUFMASK)) {
retry:
		error = 0;
		parent = hammer2_chain_lookup_init(ip->chain, 0);
		nradix = hammer2_allocsize(nblksize);

		chain = hammer2_chain_lookup(&parent,
					     obase, obase,
					     HAMMER2_LOOKUP_NODATA);
		if (chain == NULL) {
			error = hammer2_chain_create(trans, &parent, &chain,
						     obase, nblksize,
						     HAMMER2_BREF_TYPE_DATA,
						     nblksize);
			if (chain == NULL) {
				/*
				 * NOTE: panic() does not return; the goto
				 *	 retry is dead code while the panic
				 *	 remains in place.
				 */
				hammer2_chain_lookup_done(parent);
				panic("hammer2_chain_create: par=%p error=%d\n",
					parent, error);
				goto retry;
			}
			/*ip->delta_dcount += nblksize;*/
		} else {
			KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
			hammer2_chain_resize(trans, ip, bp,
					     parent, &chain,
					     nradix,
					     HAMMER2_MODIFY_OPTDATA);
		}
		if (obase != nbase) {
			if (oblksize != HAMMER2_PBUFSIZE)
				allocbuf(bp, HAMMER2_PBUFSIZE);
		} else {
			if (oblksize != nblksize)
				allocbuf(bp, nblksize);
		}
		bp->b_bio2.bio_offset = chain->bref.data_off &
					HAMMER2_OFF_MASK;
		hammer2_chain_unlock(chain);
		if (bp->b_bcount == HAMMER2_PBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;
		bdwrite(bp);
		hammer2_chain_lookup_done(parent);  /* must be after bdwrite */
	}
}
1484
/*
 * hammer2_vop_nresolve { nch, dvp, cred }
 *
 * Resolve a filename in directory (dvp) to a vnode and cache the result
 * in the namecache.  Handles hardlink forwarding entries by locating
 * the real inode, and opportunistically deconsolidates hardlinks whose
 * nlinks have dropped to 1 (read-write mounts only).
 *
 * Returns 0 on success, ENOENT when the name does not exist (with the
 * namecache resolved to NULL), or another errno with the namecache left
 * unresolved.
 */
static
int
hammer2_vop_nresolve(struct vop_nresolve_args *ap)
{
	hammer2_inode_t *ip;
	hammer2_inode_t *dip;
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_chain_t *ochain;	/* original hardlink-pointer chain */
	hammer2_trans_t trans;
	struct namecache *ncp;
	const uint8_t *name;
	size_t name_len;
	hammer2_key_t lhc;		/* directory hash of the name */
	int error = 0;
	struct vnode *vp;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;
	ncp = ap->a_nch->ncp;
	name = ncp->nc_name;
	name_len = ncp->nc_nlen;
	lhc = hammer2_dirhash(name, name_len);

	/*
	 * Note: In DragonFly the kernel handles '.' and '..'.
	 *
	 * Scan the directory for an inode whose name matches exactly,
	 * iterating over hash collisions within the DIRHASH_LOMASK range.
	 * Shared locks suffice for the lookup.
	 */
	hammer2_inode_lock_sh(dip);
	parent = hammer2_chain_lookup_init(dip->chain, HAMMER2_LOOKUP_SHARED);
	chain = hammer2_chain_lookup(&parent,
				     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
				     HAMMER2_LOOKUP_SHARED);
	while (chain) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
		    name_len == chain->data->ipdata.name_len &&
		    bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
			break;
		}
		chain = hammer2_chain_next(&parent, chain,
					   lhc, lhc + HAMMER2_DIRHASH_LOMASK,
					   HAMMER2_LOOKUP_SHARED);
	}
	hammer2_chain_lookup_done(parent);
	hammer2_inode_unlock_sh(dip);

	/*
	 * If the inode represents a forwarding entry for a hardlink we have
	 * to locate the actual inode.  The original ip is saved for possible
	 * deconsolidation.  (ip) will only be set to non-NULL when we have
	 * to locate the real file via a hardlink.  ip will be referenced but
	 * not locked in that situation.  chain is passed in locked and
	 * returned locked.
	 *
	 * XXX what kind of chain lock?
	 */
	ochain = NULL;
	if (chain && chain->data->ipdata.type == HAMMER2_OBJTYPE_HARDLINK) {
		error = hammer2_hardlink_find(dip, &chain, &ochain);
		if (error) {
			kprintf("hammer2: unable to find hardlink\n");
			if (chain) {
				hammer2_chain_unlock(chain);
				chain = NULL;
			}
			goto failed;
		}
	}

	/*
	 * Deconsolidate any hardlink whos nlinks == 1.  Ignore errors.
	 * If an error occurs chain and ip are left alone.
	 *
	 * XXX upgrade shared lock?
	 */
	if (ochain && chain && chain->data->ipdata.nlinks == 1 && !hmp->ronly) {
		kprintf("hammer2: need to unconsolidate hardlink for %s\n",
			chain->data->ipdata.filename);
		/* XXX retain shared lock on dip? (currently not held) */
		hammer2_trans_init(dip->hmp, &trans, 0);
		hammer2_hardlink_deconsolidate(&trans, dip, &chain, &ochain);
		hammer2_trans_done(&trans);
	}

	/*
	 * Acquire the related vnode
	 *
	 * NOTE: For error processing, only ENOENT resolves the namecache
	 *       entry to NULL, otherwise we just return the error and
	 *       leave the namecache unresolved.
	 *
	 * NOTE: multiple hammer2_inode structures can be aliased to the
	 *       same chain element, for example for hardlinks.  This
	 *       use case does not 'reattach' inode associations that
	 *       might already exist, but always allocates a new one.
	 *
	 * WARNING: inode structure is locked exclusively via inode_get
	 *          but chain was locked shared.  inode_unlock_ex()
	 *          will handle it properly.
	 */
	if (chain) {
		ip = hammer2_inode_get(hmp, dip->pmp, dip, chain);
		vp = hammer2_igetv(ip, &error);
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
		} else if (error == ENOENT) {
			cache_setvp(ap->a_nch, NULL);
		}
		hammer2_inode_unlock_ex(ip);

		/*
		 * The vp should not be released until after we've disposed
		 * of our locks, because it might cause vop_inactive() to
		 * be called.
		 */
		if (vp)
			vrele(vp);
	} else {
		error = ENOENT;
		cache_setvp(ap->a_nch, NULL);
	}
failed:
	KASSERT(error || ap->a_nch->ncp->nc_vp != NULL,
		("resolve error %d/%p chain %p ap %p\n",
		 error, ap->a_nch->ncp->nc_vp, chain, ap));
	if (ochain)
		hammer2_chain_drop(ochain);
	return error;
}
1615
1616 static
1617 int
1618 hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1619 {
1620         hammer2_inode_t *dip;
1621         hammer2_inode_t *ip;
1622         hammer2_mount_t *hmp;
1623         int error;
1624
1625         dip = VTOI(ap->a_dvp);
1626         hmp = dip->hmp;
1627
1628         if ((ip = dip->pip) == NULL) {
1629                 *ap->a_vpp = NULL;
1630                 return ENOENT;
1631         }
1632         hammer2_inode_lock_ex(ip);
1633         *ap->a_vpp = hammer2_igetv(ip, &error);
1634         hammer2_inode_unlock_ex(ip);
1635
1636         return error;
1637 }
1638
1639 static
1640 int
1641 hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
1642 {
1643         hammer2_mount_t *hmp;
1644         hammer2_inode_t *dip;
1645         hammer2_inode_t *nip;
1646         hammer2_trans_t trans;
1647         struct namecache *ncp;
1648         const uint8_t *name;
1649         size_t name_len;
1650         int error;
1651
1652         dip = VTOI(ap->a_dvp);
1653         hmp = dip->hmp;
1654         if (hmp->ronly)
1655                 return (EROFS);
1656
1657         ncp = ap->a_nch->ncp;
1658         name = ncp->nc_name;
1659         name_len = ncp->nc_nlen;
1660
1661         hammer2_trans_init(hmp, &trans, 0);
1662         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1663                                    name, name_len, &error);
1664         if (error) {
1665                 KKASSERT(nip == NULL);
1666                 *ap->a_vpp = NULL;
1667         } else {
1668                 *ap->a_vpp = hammer2_igetv(nip, &error);
1669                 hammer2_inode_unlock_ex(nip);
1670         }
1671         hammer2_trans_done(&trans);
1672
1673         if (error == 0) {
1674                 cache_setunresolved(ap->a_nch);
1675                 cache_setvp(ap->a_nch, *ap->a_vpp);
1676         }
1677         return error;
1678 }
1679
1680 /*
1681  * Return the largest contiguous physical disk range for the logical
1682  * request.
1683  *
1684  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
1685  */
1686 static
1687 int
1688 hammer2_vop_bmap(struct vop_bmap_args *ap)
1689 {
1690         struct vnode *vp;
1691         hammer2_mount_t *hmp;
1692         hammer2_inode_t *ip;
1693         hammer2_chain_t *parent;
1694         hammer2_chain_t *chain;
1695         hammer2_key_t lbeg;
1696         hammer2_key_t lend;
1697         hammer2_off_t pbeg;
1698         hammer2_off_t pbytes;
1699         hammer2_off_t array[HAMMER2_BMAP_COUNT][2];
1700         int loff;
1701         int ai;
1702
1703         /*
1704          * Only supported on regular files
1705          *
1706          * Only supported for read operations (required for cluster_read).
1707          * The block allocation is delayed for write operations.
1708          */
1709         vp = ap->a_vp;
1710         if (vp->v_type != VREG)
1711                 return (EOPNOTSUPP);
1712         if (ap->a_cmd != BUF_CMD_READ)
1713                 return (EOPNOTSUPP);
1714
1715         ip = VTOI(vp);
1716         hmp = ip->hmp;
1717         bzero(array, sizeof(array));
1718
1719         /*
1720          * Calculate logical range
1721          */
1722         KKASSERT((ap->a_loffset & HAMMER2_LBUFMASK64) == 0);
1723         lbeg = ap->a_loffset & HAMMER2_OFF_MASK_HI;
1724         lend = lbeg + HAMMER2_BMAP_COUNT * HAMMER2_PBUFSIZE - 1;
1725         if (lend < lbeg)
1726                 lend = lbeg;
1727         loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;
1728
1729         hammer2_inode_lock_sh(ip);
1730         parent = hammer2_chain_lookup_init(ip->chain, HAMMER2_LOOKUP_SHARED);
1731         chain = hammer2_chain_lookup(&parent,
1732                                      lbeg, lend,
1733                                      HAMMER2_LOOKUP_NODATA |
1734                                      HAMMER2_LOOKUP_SHARED);
1735         if (chain == NULL) {
1736                 *ap->a_doffsetp = ZFOFFSET;
1737                 hammer2_chain_lookup_done(parent);
1738                 hammer2_inode_unlock_sh(ip);
1739                 return (0);
1740         }
1741
1742         while (chain) {
1743                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1744                         ai = (chain->bref.key - lbeg) / HAMMER2_PBUFSIZE;
1745                         KKASSERT(ai >= 0 && ai < HAMMER2_BMAP_COUNT);
1746                         array[ai][0] = chain->bref.data_off & HAMMER2_OFF_MASK;
1747                         array[ai][1] = chain->bytes;
1748                 }
1749                 chain = hammer2_chain_next(&parent, chain,
1750                                            lbeg, lend,
1751                                            HAMMER2_LOOKUP_NODATA |
1752                                            HAMMER2_LOOKUP_SHARED);
1753         }
1754         hammer2_chain_lookup_done(parent);
1755         hammer2_inode_unlock_sh(ip);
1756
1757         /*
1758          * If the requested loffset is not mappable physically we can't
1759          * bmap.  The caller will have to access the file data via a
1760          * device buffer.
1761          */
1762         if (array[0][0] == 0 || array[0][1] < loff + HAMMER2_LBUFSIZE) {
1763                 *ap->a_doffsetp = NOOFFSET;
1764                 return (0);
1765         }
1766
1767         /*
1768          * Calculate the physical disk offset range for array[0]
1769          */
1770         pbeg = array[0][0] + loff;
1771         pbytes = array[0][1] - loff;
1772
1773         for (ai = 1; ai < HAMMER2_BMAP_COUNT; ++ai) {
1774                 if (array[ai][0] != pbeg + pbytes)
1775                         break;
1776                 pbytes += array[ai][1];
1777         }
1778
1779         *ap->a_doffsetp = pbeg;
1780         if (ap->a_runp)
1781                 *ap->a_runp = pbytes;
1782         return (0);
1783 }
1784
/*
 * hammer2_vop_open { vp, mode, cred, fp }
 */
static
int
hammer2_vop_open(struct vop_open_args *ap)
{
        /* No hammer2-specific open state; defer to the stock handler. */
        return (vop_stdopen(ap));
}
1791
1792 /*
1793  * hammer2_vop_advlock { vp, id, op, fl, flags }
1794  */
1795 static
1796 int
1797 hammer2_vop_advlock(struct vop_advlock_args *ap)
1798 {
1799         hammer2_inode_t *ip = VTOI(ap->a_vp);
1800         hammer2_off_t size;
1801
1802         hammer2_inode_lock_sh(ip);
1803         size = ip->chain->data->ipdata.size;
1804         hammer2_inode_unlock_sh(ip);
1805         return (lf_advlock(ap, &ip->advlock, size));
1806 }
1807
1808
/*
 * hammer2_vop_close { vp, fflag }
 */
static
int
hammer2_vop_close(struct vop_close_args *ap)
{
        /* No hammer2-specific close processing; use the stock handler. */
        return (vop_stdclose(ap));
}
1815
1816 /*
1817  * hammer2_vop_nlink { nch, dvp, vp, cred }
1818  *
1819  * Create a hardlink from (vp) to {dvp, nch}.
1820  */
1821 static
1822 int
1823 hammer2_vop_nlink(struct vop_nlink_args *ap)
1824 {
1825         hammer2_inode_t *dip;   /* target directory to create link in */
1826         hammer2_inode_t *ip;    /* inode we are hardlinking to */
1827         hammer2_mount_t *hmp;
1828         hammer2_chain_t *chain;
1829         hammer2_trans_t trans;
1830         struct namecache *ncp;
1831         const uint8_t *name;
1832         size_t name_len;
1833         int error;
1834
1835         dip = VTOI(ap->a_dvp);
1836         hmp = dip->hmp;
1837         if (hmp->ronly)
1838                 return (EROFS);
1839
1840         ncp = ap->a_nch->ncp;
1841         name = ncp->nc_name;
1842         name_len = ncp->nc_nlen;
1843         hammer2_trans_init(hmp, &trans, 0);
1844
1845         /*
1846          * ip represents the file being hardlinked.  The file could be a
1847          * normal file or a hardlink target if it has already been hardlinked.
1848          * If ip is a hardlinked target then ip->pip represents the location
1849          * of the hardlinked target, NOT the location of the hardlink pointer.
1850          *
1851          * Bump nlinks and potentially also create or move the hardlink
1852          * target in the parent directory common to (ip) and (dip).  The
1853          * consolidation code can modify ip->chain and ip->pip.  The
1854          * returned chain is locked.
1855          */
1856         ip = VTOI(ap->a_vp);
1857         hammer2_inode_lock_ex(ip);
1858         error = hammer2_hardlink_consolidate(&trans, ip, &chain, dip, 1);
1859         if (error)
1860                 goto done;
1861
1862         /*
1863          * Create a directory entry connected to the specified chain.
1864          * The hardlink consolidation code has already adjusted ip->pip
1865          * to the common parent directory containing the actual hardlink
1866          *
1867          * (which may be different from dip where we created our hardlink
1868          * entry. ip->chain always represents the actual hardlink and not
1869          * any of the pointers to the actual hardlink).
1870          */
1871         error = hammer2_inode_connect(&trans, 1,
1872                                       dip, &chain,
1873                                       name, name_len);
1874         if (error == 0) {
1875                 cache_setunresolved(ap->a_nch);
1876                 cache_setvp(ap->a_nch, ap->a_vp);
1877         }
1878 done:
1879         if (chain)
1880                 hammer2_chain_unlock(chain);
1881         hammer2_inode_unlock_ex(ip);
1882         hammer2_trans_done(&trans);
1883
1884         return error;
1885 }
1886
1887 /*
1888  * hammer2_vop_ncreate { nch, dvp, vpp, cred, vap }
1889  *
1890  * The operating system has already ensured that the directory entry
1891  * does not exist and done all appropriate namespace locking.
1892  */
1893 static
1894 int
1895 hammer2_vop_ncreate(struct vop_ncreate_args *ap)
1896 {
1897         hammer2_mount_t *hmp;
1898         hammer2_inode_t *dip;
1899         hammer2_inode_t *nip;
1900         hammer2_trans_t trans;
1901         struct namecache *ncp;
1902         const uint8_t *name;
1903         size_t name_len;
1904         int error;
1905
1906         dip = VTOI(ap->a_dvp);
1907         hmp = dip->hmp;
1908         if (hmp->ronly)
1909                 return (EROFS);
1910
1911         ncp = ap->a_nch->ncp;
1912         name = ncp->nc_name;
1913         name_len = ncp->nc_nlen;
1914         hammer2_trans_init(hmp, &trans, 0);
1915
1916         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1917                                    name, name_len, &error);
1918         if (error) {
1919                 KKASSERT(nip == NULL);
1920                 *ap->a_vpp = NULL;
1921         } else {
1922                 *ap->a_vpp = hammer2_igetv(nip, &error);
1923                 hammer2_inode_unlock_ex(nip);
1924         }
1925         hammer2_trans_done(&trans);
1926
1927         if (error == 0) {
1928                 cache_setunresolved(ap->a_nch);
1929                 cache_setvp(ap->a_nch, *ap->a_vpp);
1930         }
1931         return error;
1932 }
1933
1934 /*
1935  * hammer2_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1936  */
static
int
hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
{
        hammer2_mount_t *hmp;
        hammer2_inode_t *dip;
        hammer2_inode_t *nip;
        hammer2_trans_t trans;
        struct namecache *ncp;
        const uint8_t *name;
        size_t name_len;
        int error;

        dip = VTOI(ap->a_dvp);
        hmp = dip->hmp;
        if (hmp->ronly)
                return (EROFS);

        ncp = ap->a_nch->ncp;
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
        hammer2_trans_init(hmp, &trans, 0);

        ap->a_vap->va_type = VLNK;      /* enforce type */

        /*
         * Create the symlink's inode; on success it is returned locked
         * exclusively (unlocked below after the target is written).
         */
        nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
                                   name, name_len, &error);
        if (error) {
                KKASSERT(nip == NULL);
                *ap->a_vpp = NULL;
                hammer2_trans_done(&trans);
                return error;
        }
        *ap->a_vpp = hammer2_igetv(nip, &error);

        /*
         * Build the softlink (~like file data) and finalize the namecache.
         *
         * Short targets are stored directly in the inode's embedded data
         * area; longer targets are written out like regular file data.
         */
        if (error == 0) {
                size_t bytes;
                struct uio auio;
                struct iovec aiov;
                hammer2_inode_data_t *nipdata;

                nipdata = &nip->chain->data->ipdata;
                bytes = strlen(ap->a_target);

                if (bytes <= HAMMER2_EMBEDDED_BYTES) {
                        /* fits in the inode's direct-data area */
                        KKASSERT(nipdata->op_flags &
                                 HAMMER2_OPFLAG_DIRECTDATA);
                        bcopy(ap->a_target, nipdata->u.data, bytes);
                        nipdata->size = bytes;
                } else {
                        /*
                         * Write the target via the normal file write
                         * path using a synthesized system-space uio.
                         */
                        bzero(&auio, sizeof(auio));
                        bzero(&aiov, sizeof(aiov));
                        auio.uio_iov = &aiov;
                        auio.uio_segflg = UIO_SYSSPACE;
                        auio.uio_rw = UIO_WRITE;
                        auio.uio_resid = bytes;
                        auio.uio_iovcnt = 1;
                        auio.uio_td = curthread;
                        aiov.iov_base = ap->a_target;
                        aiov.iov_len = bytes;
                        error = hammer2_write_file(nip, &trans,
                                                   &auio, IO_APPEND, 0);
                        nipdata = &nip->chain->data->ipdata; /* RELOAD */
                        /* XXX handle error */
                        error = 0;
                }
        }
        hammer2_inode_unlock_ex(nip);
        hammer2_trans_done(&trans);

        /*
         * Finalize namecache
         */
        if (error == 0) {
                cache_setunresolved(ap->a_nch);
                cache_setvp(ap->a_nch, *ap->a_vpp);
                /* hammer2_knote(ap->a_dvp, NOTE_WRITE); */
        }
        return error;
}
2020
2021 /*
2022  * hammer2_vop_nremove { nch, dvp, cred }
2023  */
2024 static
2025 int
2026 hammer2_vop_nremove(struct vop_nremove_args *ap)
2027 {
2028         hammer2_inode_t *dip;
2029         hammer2_mount_t *hmp;
2030         hammer2_trans_t trans;
2031         struct namecache *ncp;
2032         const uint8_t *name;
2033         size_t name_len;
2034         int error;
2035
2036         dip = VTOI(ap->a_dvp);
2037         hmp = dip->hmp;
2038         if (hmp->ronly)
2039                 return(EROFS);
2040
2041         ncp = ap->a_nch->ncp;
2042         name = ncp->nc_name;
2043         name_len = ncp->nc_nlen;
2044         hammer2_trans_init(hmp, &trans, 0);
2045         error = hammer2_unlink_file(&trans, dip, name, name_len, 0, NULL);
2046         hammer2_trans_done(&trans);
2047         if (error == 0) {
2048                 cache_unlink(ap->a_nch);
2049         }
2050         return (error);
2051 }
2052
2053 /*
2054  * hammer2_vop_nrmdir { nch, dvp, cred }
2055  */
2056 static
2057 int
2058 hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
2059 {
2060         hammer2_inode_t *dip;
2061         hammer2_mount_t *hmp;
2062         hammer2_trans_t trans;
2063         struct namecache *ncp;
2064         const uint8_t *name;
2065         size_t name_len;
2066         int error;
2067
2068         dip = VTOI(ap->a_dvp);
2069         hmp = dip->hmp;
2070         if (hmp->ronly)
2071                 return(EROFS);
2072
2073         ncp = ap->a_nch->ncp;
2074         name = ncp->nc_name;
2075         name_len = ncp->nc_nlen;
2076
2077         hammer2_trans_init(hmp, &trans, 0);
2078         error = hammer2_unlink_file(&trans, dip, name, name_len, 1, NULL);
2079         hammer2_trans_done(&trans);
2080         if (error == 0) {
2081                 cache_unlink(ap->a_nch);
2082         }
2083         return (error);
2084 }
2085
2086 /*
2087  * hammer2_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
2088  */
static
int
hammer2_vop_nrename(struct vop_nrename_args *ap)
{
        struct namecache *fncp;
        struct namecache *tncp;
        hammer2_inode_t *fdip;
        hammer2_inode_t *tdip;
        hammer2_inode_t *ip;
        hammer2_chain_t *chain;
        hammer2_mount_t *hmp;
        hammer2_trans_t trans;
        const uint8_t *fname;
        size_t fname_len;
        const uint8_t *tname;
        size_t tname_len;
        int error;
        int hlink;

        /*
         * Cross-mount renames are not supported.
         */
        if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
                return(EXDEV);
        if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
                return(EXDEV);

        fdip = VTOI(ap->a_fdvp);        /* source directory */
        tdip = VTOI(ap->a_tdvp);        /* target directory */

        hmp = fdip->hmp;                /* check read-only filesystem */
        if (hmp->ronly)
                return(EROFS);

        fncp = ap->a_fnch->ncp;         /* entry name in source */
        fname = fncp->nc_name;
        fname_len = fncp->nc_nlen;

        tncp = ap->a_tnch->ncp;         /* entry name in target */
        tname = tncp->nc_name;
        tname_len = tncp->nc_nlen;

        hammer2_trans_init(hmp, &trans, 0);

        /*
         * ip is the inode being renamed.  If this is a hardlink then
         * ip represents the actual file and not the hardlink marker.
         */
        ip = VTOI(fncp->nc_vp);
        chain = NULL;

        /*
         * Keep a tight grip on the inode so the temporary unlinking from
         * the source location prior to linking to the target location
         * does not cause the chain to be destroyed.
         *
         * NOTE: To avoid deadlocks we cannot lock (ip) while we are
         *       unlinking elements from their directories.  Locking
         *       the nlinks field does not lock the whole inode.
         */
        hammer2_inode_ref(ip);

        /*
         * Remove target if it exists.  ENOENT here simply means there
         * was nothing to remove and is not an error.
         */
        error = hammer2_unlink_file(&trans, tdip, tname, tname_len, -1, NULL);
        if (error && error != ENOENT)
                goto done;
        cache_setunresolved(ap->a_tnch);

        /*
         * When renaming a hardlinked file we may have to re-consolidate
         * the location of the hardlink target.  Since the element is simply
         * being moved, nlinks is not modified in this case.
         *
         * If ip represents a regular file the consolidation code essentially
         * does nothing other than return the locked chain.
         *
         * The returned chain will be locked.
         *
         * WARNING!  We do not currently have a local copy of ipdata but
         *           we do use one later; remember that it must be reloaded
         *           on any modification to the inode, including connects.
         */
        hammer2_inode_lock_ex(ip);
        error = hammer2_hardlink_consolidate(&trans, ip, &chain, tdip, 0);
        if (error)
                goto done;

        /*
         * Disconnect (fdip, fname) from the source directory.  This will
         * disconnect (ip) if it represents a direct file.  If (ip) represents
         * a hardlink the HARDLINK pointer object will be removed but the
         * hardlink will stay intact.
         *
         * The target chain may be marked DELETED but will not be destroyed
         * since we retain our hold on ip and chain.
         */
        error = hammer2_unlink_file(&trans, fdip, fname, fname_len, -1, &hlink);
        KKASSERT(error != EAGAIN);
        if (error)
                goto done;

        /*
         * Reconnect ip to target directory using chain.  Chains cannot
         * actually be moved, so this will duplicate the chain in the new
         * spot and assign it to the ip, replacing the old chain.
         *
         * WARNING: chain locks can lock buffer cache buffers, to avoid
         *          deadlocks we want to unlock before issuing a cache_*()
         *          op (that might have to lock a vnode).
         */
        error = hammer2_inode_connect(&trans, hlink,
                                      tdip, &chain,
                                      tname, tname_len);
        if (error == 0) {
                KKASSERT(chain != NULL);
                hammer2_inode_repoint(ip, (hlink ? ip->pip : tdip), chain);
                cache_rename(ap->a_fnch, ap->a_tnch);
        }
done:
        if (chain)
                hammer2_chain_unlock(chain);
        hammer2_inode_unlock_ex(ip);
        hammer2_inode_drop(ip);
        hammer2_trans_done(&trans);

        return (error);
}
2215
2216 /*
2217  * Strategy code
2218  *
2219  * WARNING: The strategy code cannot safely use hammer2 transactions
2220  *          as this can deadlock against vfs_sync's vfsync() call
2221  *          if multiple flushes are queued.
2222  */
2223 static int hammer2_strategy_read(struct vop_strategy_args *ap);
2224 static int hammer2_strategy_write(struct vop_strategy_args *ap);
2225
2226 static
2227 int
2228 hammer2_vop_strategy(struct vop_strategy_args *ap)
2229 {
2230         struct bio *biop;
2231         struct buf *bp;
2232         int error;
2233
2234         biop = ap->a_bio;
2235         bp = biop->bio_buf;
2236
2237         switch(bp->b_cmd) {
2238         case BUF_CMD_READ:
2239                 error = hammer2_strategy_read(ap);
2240                 ++hammer2_iod_file_read;
2241                 break;
2242         case BUF_CMD_WRITE:
2243                 error = hammer2_strategy_write(ap);
2244                 ++hammer2_iod_file_write;
2245                 break;
2246         default:
2247                 bp->b_error = error = EINVAL;
2248                 bp->b_flags |= B_ERROR;
2249                 biodone(biop);
2250                 break;
2251         }
2252
2253         return (error);
2254 }
2255
/*
 * Read strategy: resolve the bio's logical offset to a device offset
 * (or to zero-fill / inode-embedded data) and dispatch it accordingly.
 */
static
int
hammer2_strategy_read(struct vop_strategy_args *ap)
{
        struct buf *bp;
        struct bio *bio;
        struct bio *nbio;
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        hammer2_key_t lbase;

        bio = ap->a_bio;
        bp = bio->bio_buf;
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        nbio = push_bio(bio);

        lbase = bio->bio_offset;
        chain = NULL;
        KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);

        /*
         * We must characterize the logical->physical translation if it
         * has not already been cached.
         *
         * Physical data references < LBUFSIZE are never cached.  This
         * includes both small-block allocations and inode-embedded data.
         */
        if (nbio->bio_offset == NOOFFSET) {
                hammer2_inode_lock_sh(ip);

                parent = hammer2_chain_lookup_init(ip->chain,
                                                   HAMMER2_LOOKUP_SHARED);

                chain = hammer2_chain_lookup(&parent, lbase, lbase,
                                             HAMMER2_LOOKUP_NODATA |
                                             HAMMER2_LOOKUP_SHARED);
                if (chain == NULL) {
                        /*
                         * Data is zero-fill
                         */
                        nbio->bio_offset = ZFOFFSET;
                } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        /*
                         * Data is embedded in the inode (do nothing)
                         */
                        KKASSERT(chain == parent);
                        hammer2_chain_unlock(chain);
                } else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
                        /*
                         * Data is on-media
                         */
                        KKASSERT(bp->b_bcount == chain->bytes);
                        nbio->bio_offset = chain->bref.data_off &
                                           HAMMER2_OFF_MASK;
                        hammer2_chain_unlock(chain);
                        KKASSERT(nbio->bio_offset != 0);
                } else {
                        panic("hammer2_strategy_read: unknown bref type");
                }
                hammer2_chain_lookup_done(parent);
                hammer2_inode_unlock_sh(ip);
        }

        if (hammer2_debug & 0x0020) {
                kprintf("read %016jx %016jx\n",
                        bio->bio_offset, nbio->bio_offset);
        }

        if (nbio->bio_offset == ZFOFFSET) {
                /*
                 * Data is zero-fill
                 */
                bp->b_resid = 0;
                bp->b_error = 0;
                bzero(bp->b_data, bp->b_bcount);
                biodone(nbio);
        } else if (nbio->bio_offset != NOOFFSET) {
                /*
                 * Forward direct IO to the device
                 */
                vn_strategy(hmp->devvp, nbio);
        } else {
                /*
                 * Data is embedded in inode.
                 *
                 * NOTE(review): chain was unlocked in the INODE branch
                 * above but chain->data is dereferenced here after the
                 * shared inode lock is also dropped — presumably the
                 * inode's reference keeps the data valid; verify.
                 */
                bcopy(chain->data->ipdata.u.data, bp->b_data,
                      HAMMER2_EMBEDDED_BYTES);
                bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
                      bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
                bp->b_resid = 0;
                bp->b_error = 0;
                biodone(nbio);
        }
        return (0);
}
2354
/*
 * Write strategy: the device offset has already been assigned (see
 * the KKASSERTs); either copy into the inode's embedded data area or
 * forward the bio to the device.
 */
static
int
hammer2_strategy_write(struct vop_strategy_args *ap)
{
        struct buf *bp;
        struct bio *bio;
        struct bio *nbio;
        hammer2_chain_t *chain;
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;

        bio = ap->a_bio;
        bp = bio->bio_buf;
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        nbio = push_bio(bio);

        /* translation must already exist and cannot be zero-fill */
        KKASSERT((bio->bio_offset & HAMMER2_PBUFMASK64) == 0);
        KKASSERT(nbio->bio_offset != 0 && nbio->bio_offset != ZFOFFSET);

        if (nbio->bio_offset == NOOFFSET) {
                /*
                 * The data is embedded in the inode.  Note that strategy
                 * calls for embedded data are synchronous in order to
                 * ensure that ip->chain is stable.
                 */
                KKASSERT(bio->bio_offset == 0);
                KKASSERT(ip->chain && ip->chain->data);
                chain = ip->chain;
                bcopy(bp->b_data, chain->data->ipdata.u.data,
                      HAMMER2_EMBEDDED_BYTES);
                bp->b_resid = 0;
                bp->b_error = 0;
                biodone(nbio);

                /*
                 * This special flag does not follow the normal MODIFY rules
                 * because we might deadlock on ip.  Instead we depend on
                 * VOP_FSYNC() to detect the case.
                 */
                atomic_set_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
        } else {
                /*
                 * Forward direct IO to the device
                 */
                vn_strategy(hmp->devvp, nbio);
        }
        return (0);
}
2404
2405 /*
2406  * hammer2_vop_ioctl { vp, command, data, fflag, cred }
2407  */
2408 static
2409 int
2410 hammer2_vop_ioctl(struct vop_ioctl_args *ap)
2411 {
2412         hammer2_mount_t *hmp;
2413         hammer2_inode_t *ip;
2414         int error;
2415
2416         ip = VTOI(ap->a_vp);
2417         hmp = ip->hmp;
2418
2419         error = hammer2_ioctl(ip, ap->a_command, (void *)ap->a_data,
2420                               ap->a_fflag, ap->a_cred);
2421         return (error);
2422 }
2423
2424 static
2425 int 
2426 hammer2_vop_mountctl(struct vop_mountctl_args *ap)
2427 {
2428         struct mount *mp;
2429         hammer2_pfsmount_t *pmp;
2430         int rc;
2431
2432         switch (ap->a_op) {
2433         case (MOUNTCTL_SET_EXPORT):
2434                 mp = ap->a_head.a_ops->head.vv_mount;
2435                 pmp = MPTOPMP(mp);
2436
2437                 if (ap->a_ctllen != sizeof(struct export_args))
2438                         rc = (EINVAL);
2439                 else
2440                         rc = vfs_export(mp, &pmp->export,
2441                                         (const struct export_args *)ap->a_ctl);
2442                 break;
2443         default:
2444                 rc = vop_stdmountctl(ap);
2445                 break;
2446         }
2447         return (rc);
2448 }
2449
2450 struct vop_ops hammer2_vnode_vops = {
2451         .vop_default    = vop_defaultop,
2452         .vop_fsync      = hammer2_vop_fsync,
2453         .vop_getpages   = vop_stdgetpages,
2454         .vop_putpages   = vop_stdputpages,
2455         .vop_access     = hammer2_vop_access,
2456         .vop_advlock    = hammer2_vop_advlock,
2457         .vop_close      = hammer2_vop_close,
2458         .vop_nlink      = hammer2_vop_nlink,
2459         .vop_ncreate    = hammer2_vop_ncreate,
2460         .vop_nsymlink   = hammer2_vop_nsymlink,
2461         .vop_nremove    = hammer2_vop_nremove,
2462         .vop_nrmdir     = hammer2_vop_nrmdir,
2463         .vop_nrename    = hammer2_vop_nrename,
2464         .vop_getattr    = hammer2_vop_getattr,
2465         .vop_setattr    = hammer2_vop_setattr,
2466         .vop_readdir    = hammer2_vop_readdir,
2467         .vop_readlink   = hammer2_vop_readlink,
2468         .vop_getpages   = vop_stdgetpages,
2469         .vop_putpages   = vop_stdputpages,
2470         .vop_read       = hammer2_vop_read,
2471         .vop_write      = hammer2_vop_write,
2472         .vop_open       = hammer2_vop_open,
2473         .vop_inactive   = hammer2_vop_inactive,
2474         .vop_reclaim    = hammer2_vop_reclaim,
2475         .vop_nresolve   = hammer2_vop_nresolve,
2476         .vop_nlookupdotdot = hammer2_vop_nlookupdotdot,
2477         .vop_nmkdir     = hammer2_vop_nmkdir,
2478         .vop_ioctl      = hammer2_vop_ioctl,
2479         .vop_mountctl   = hammer2_vop_mountctl,
2480         .vop_bmap       = hammer2_vop_bmap,
2481         .vop_strategy   = hammer2_vop_strategy,
2482 };
2483
struct vop_ops hammer2_spec_vops = {
        /* XXX(review): empty — no special-file ops installed yet */
};
2487
struct vop_ops hammer2_fifo_vops = {
        /* XXX(review): empty — no fifo ops installed yet */
};