Merge branch 'master' of ssh://crater.dragonflybsd.org/repository/git/dragonfly
[dragonfly.git] / sys / vfs / hammer2 / hammer2_vnops.c
1 /*
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/fcntl.h>
39 #include <sys/buf.h>
40 #include <sys/proc.h>
41 #include <sys/namei.h>
42 #include <sys/mount.h>
43 #include <sys/vnode.h>
44 #include <sys/mountctl.h>
45 #include <sys/dirent.h>
46 #include <sys/uio.h>
47
48 #include "hammer2.h"
49
/* NOTE(review): presumably a "zero-fill" sentinel offset -- confirm at use site */
#define ZFOFFSET        (-2LL)

/* File data read/write helpers (definitions below). */
static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
				int seqcount);
static int hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
				int ioflag, int seqcount);
/* Assign physical storage backing a logical file block. */
static hammer2_off_t hammer2_assign_physical(hammer2_trans_t *trans,
				hammer2_inode_t *ip,
				hammer2_key_t lbase, int lblksize,
				int *errorp);
/* File size changes; both take an open transaction. */
static void hammer2_extend_file(hammer2_trans_t *trans,
				hammer2_inode_t *ip, hammer2_key_t nsize);
static void hammer2_truncate_file(hammer2_trans_t *trans,
				hammer2_inode_t *ip, hammer2_key_t nsize);
64
65 static __inline
66 void
67 hammer2_knote(struct vnode *vp, int flags)
68 {
69         if (flags)
70                 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
71 }
72
/*
 * hammer2_vop_inactive { vp }
 *
 * Last reference to a vnode is going away but it is still cached.
 */
static
int
hammer2_vop_inactive(struct vop_inactive_args *ap)
{
	hammer2_inode_t *ip;
	hammer2_trans_t trans;
	struct vnode *vp;
#if 0
	struct hammer2_mount *hmp;
#endif

	vp = ap->a_vp;
	ip = VTOI(vp);

	/*
	 * Degenerate case: the vnode has no associated inode.  Nothing
	 * to synchronize, just recycle it.
	 */
	if (ip == NULL) {
		vrecycle(vp);
		return (0);
	}

	/*
	 * Detect updates to the embedded data which may be synchronized by
	 * the strategy code.  Simply mark the inode modified so it gets
	 * picked up by our normal flush.
	 */
	hammer2_inode_lock_ex(ip);
	KKASSERT(ip->chain);
	if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
		atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
		/* short transaction solely to dirty the inode chain */
		hammer2_trans_init(&trans, ip->hmp);
		hammer2_chain_modify(&trans, ip->chain, 0);
		hammer2_trans_done(&trans);
	}

	/*
	 * Check for deleted inodes and recycle immediately.  The inode
	 * lock is released before recycling in either case.
	 */
	if (ip->chain->flags & HAMMER2_CHAIN_DELETED) {
		hammer2_inode_unlock_ex(ip);
		vrecycle(vp);
	} else {
		hammer2_inode_unlock_ex(ip);
	}
	return (0);
}
123
124 /*
125  * Reclaim a vnode so that it can be reused; after the inode is
126  * disassociated, the filesystem must manage it alone.
127  */
128 static
129 int
130 hammer2_vop_reclaim(struct vop_reclaim_args *ap)
131 {
132         hammer2_chain_t *chain;
133         hammer2_inode_t *ip;
134         hammer2_mount_t *hmp;
135         hammer2_trans_t trans;
136         struct vnode *vp;
137
138         vp = ap->a_vp;
139         ip = VTOI(vp);
140         if (ip == NULL)
141                 return(0);
142         hmp = ip->hmp;
143
144         /*
145          * Set SUBMODIFIED so we can detect and propagate the DESTROYED
146          * bit in the flush code.
147          */
148         hammer2_inode_lock_ex(ip);
149         chain = ip->chain;
150         vp->v_data = NULL;
151         ip->vp = NULL;
152         if (chain->flags & HAMMER2_CHAIN_DELETED) {
153                 KKASSERT(chain->flags & HAMMER2_CHAIN_DELETED);
154                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROYED |
155                                               HAMMER2_CHAIN_SUBMODIFIED);
156         }
157         if (chain->flags & (HAMMER2_CHAIN_MODIFIED |
158                             HAMMER2_CHAIN_DELETED |
159                             HAMMER2_CHAIN_SUBMODIFIED)) {
160                 hammer2_trans_init(&trans, ip->hmp);
161                 hammer2_chain_flush(&trans, chain);
162                 hammer2_trans_done(&trans);
163         }
164         if (ip->refs > 2)                           /* (our lock + vp ref) */
165                 hammer2_inode_unlock_ex(ip);        /* unlock */
166         else
167                 hammer2_inode_put(ip);              /* unlock & disconnect */
168         /* chain no longer referenced */
169         /* chain = NULL; not needed */
170         hammer2_inode_drop(ip);                     /* vp ref */
171
172         /*
173          * XXX handle background sync when ip dirty, kernel will no longer
174          * notify us regarding this inode because there is no longer a
175          * vnode attached to it.
176          */
177
178         return (0);
179 }
180
/*
 * hammer2_vop_fsync { vp, waitfor, flags }
 *
 * Flush the vnode's dirty buffers; for an actual fsync() syscall also
 * flush the inode's chain.
 */
static
int
hammer2_vop_fsync(struct vop_fsync_args *ap)
{
	hammer2_inode_t *ip;
	hammer2_trans_t trans;
	struct vnode *vp;

	vp = ap->a_vp;
	ip = VTOI(vp);

	hammer2_trans_init(&trans, ip->hmp);
	hammer2_inode_lock_ex(ip);

	/* flush dirty buffers associated with the vnode */
	vfsync(vp, ap->a_waitfor, 1, NULL, NULL);

	/*
	 * Detect updates to the embedded data which may be synchronized by
	 * the strategy code.  Simply mark the inode modified so it gets
	 * picked up by our normal flush.
	 */
	if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
		atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
		hammer2_chain_modify(&trans, ip->chain, 0);
	}

	/*
	 * Calling chain_flush here creates a lot of duplicative
	 * COW operations due to non-optimal vnode ordering.
	 *
	 * Only do it for an actual fsync() syscall.  The other forms
	 * which call this function will eventually call chain_flush
	 * on the volume root as a catch-all, which is far more optimal.
	 */
	atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
	if (ap->a_flags & VOP_FSYNC_SYSCALL)
		hammer2_chain_flush(&trans, ip->chain);
	hammer2_inode_unlock_ex(ip);
	hammer2_trans_done(&trans);
	return (0);
}
222
223 static
224 int
225 hammer2_vop_access(struct vop_access_args *ap)
226 {
227         hammer2_inode_t *ip = VTOI(ap->a_vp);
228         hammer2_inode_data_t *ipdata;
229         uid_t uid;
230         gid_t gid;
231         int error;
232
233         hammer2_inode_lock_sh(ip);
234         ipdata = &ip->chain->data->ipdata;
235         uid = hammer2_to_unix_xid(&ipdata->uid);
236         gid = hammer2_to_unix_xid(&ipdata->gid);
237         error = vop_helper_access(ap, uid, gid, ipdata->mode, ipdata->uflags);
238         hammer2_inode_unlock_sh(ip);
239
240         return (error);
241 }
242
/*
 * hammer2_vop_getattr { vp, vap }
 *
 * Fill in *vap from the inode's media data under a shared inode lock.
 */
static
int
hammer2_vop_getattr(struct vop_getattr_args *ap)
{
	hammer2_inode_data_t *ipdata;
	hammer2_pfsmount_t *pmp;
	hammer2_inode_t *ip;
	struct vnode *vp;
	struct vattr *vap;

	vp = ap->a_vp;
	vap = ap->a_vap;

	ip = VTOI(vp);
	pmp = ip->pmp;

	hammer2_inode_lock_sh(ip);
	ipdata = &ip->chain->data->ipdata;

	vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
	vap->va_fileid = ipdata->inum;
	vap->va_mode = ipdata->mode;
	vap->va_nlink = ipdata->nlinks;
	vap->va_uid = hammer2_to_unix_xid(&ipdata->uid);
	vap->va_gid = hammer2_to_unix_xid(&ipdata->gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ipdata->size;
	vap->va_blocksize = HAMMER2_PBUFSIZE;
	vap->va_flags = ipdata->uflags;
	hammer2_time_to_timespec(ipdata->ctime, &vap->va_ctime);
	hammer2_time_to_timespec(ipdata->mtime, &vap->va_mtime);
	/* atime not supported (see setattr); mtime is reported in its place */
	hammer2_time_to_timespec(ipdata->mtime, &vap->va_atime);
	vap->va_gen = 1;
	vap->va_bytes = vap->va_size;   /* XXX */
	vap->va_type = hammer2_get_vtype(ip->chain);
	vap->va_filerev = 0;
	vap->va_uid_uuid = ipdata->uid;
	vap->va_gid_uuid = ipdata->gid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	hammer2_inode_unlock_sh(ip);

	return (0);
}
289
/*
 * hammer2_vop_setattr { vp, vap, cred }
 *
 * Change inode attributes.  Runs under a transaction with the inode
 * exclusively locked.  Note that when file flags (vap->va_flags) are
 * being set, only the flag change is processed in this call; any
 * other attribute changes are not handled in the same pass.
 */
static
int
hammer2_vop_setattr(struct vop_setattr_args *ap)
{
	hammer2_inode_data_t *ipdata;
	hammer2_inode_t *ip;
	hammer2_mount_t *hmp;
	hammer2_trans_t trans;
	struct vnode *vp;
	struct vattr *vap;
	int error;
	int kflags = 0;		/* accumulated kqueue note flags */
	int domtime = 0;	/* NOTE(review): set on resize but never
				 * consumed in this function -- confirm */
	uint64_t ctime;

	vp = ap->a_vp;
	vap = ap->a_vap;
	hammer2_update_time(&ctime);

	ip = VTOI(vp);
	hmp = ip->hmp;

	/* attribute changes are refused on read-only mounts */
	if (hmp->ronly)
		return(EROFS);

	hammer2_trans_init(&trans, hmp);
	hammer2_inode_lock_ex(ip);
	ipdata = &ip->chain->data->ipdata;
	error = 0;

	if (vap->va_flags != VNOVAL) {
		u_int32_t flags;

		flags = ipdata->uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer2_to_unix_xid(&ipdata->uid),
					 ap->a_cred);
		if (error == 0) {
			if (ipdata->uflags != flags) {
				/* dirty the chain before touching media data */
				hammer2_chain_modify(&trans, ip->chain, 0);
				ipdata->uflags = flags;
				ipdata->ctime = ctime;
				kflags |= NOTE_ATTRIB;
			}
			if (ipdata->uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	/* immutable/append-only files refuse any other attribute change */
	if (ipdata->uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	/*
	 * Ownership change (chown/chgrp).  uids/gids are stored on media
	 * as uuids; convert and compare before dirtying the chain.
	 */
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ipdata->mode;
		uid_t cur_uid = hammer2_to_unix_xid(&ipdata->uid);
		gid_t cur_gid = hammer2_to_unix_xid(&ipdata->gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer2_guid_to_uuid(&uuid_uid, cur_uid);
			hammer2_guid_to_uuid(&uuid_gid, cur_gid);
			/* only dirty the chain if something actually changed */
			if (bcmp(&uuid_uid, &ipdata->uid, sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ipdata->gid, sizeof(uuid_gid)) ||
			    ipdata->mode != cur_mode
			) {
				hammer2_chain_modify(&trans, ip->chain, 0);
				ipdata->uid = uuid_uid;
				ipdata->gid = uuid_gid;
				ipdata->mode = cur_mode;
				ipdata->ctime = ctime;
			}
			kflags |= NOTE_ATTRIB;
		}
	}

	/*
	 * Resize the file (regular files only)
	 */
	if (vap->va_size != VNOVAL && ipdata->size != vap->va_size) {
		switch(vp->v_type) {
		case VREG:
			if (vap->va_size == ipdata->size)
				break;
			if (vap->va_size < ipdata->size) {
				hammer2_truncate_file(&trans, ip, vap->va_size);
			} else {
				hammer2_extend_file(&trans, ip, vap->va_size);
			}
			domtime = 1;
			break;
		default:
			error = EINVAL;
			goto done;
		}
	}
#if 0
	/* atime not supported */
	if (vap->va_atime.tv_sec != VNOVAL) {
		hammer2_chain_modify(&trans, ip->chain, 0);
		ipdata->atime = hammer2_timespec_to_time(&vap->va_atime);
		kflags |= NOTE_ATTRIB;
	}
#endif
	if (vap->va_mtime.tv_sec != VNOVAL) {
		hammer2_chain_modify(&trans, ip->chain, 0);
		ipdata->mtime = hammer2_timespec_to_time(&vap->va_mtime);
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t cur_mode = ipdata->mode;
		uid_t cur_uid = hammer2_to_unix_xid(&ipdata->uid);
		gid_t cur_gid = hammer2_to_unix_xid(&ipdata->gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ipdata->mode != cur_mode) {
			hammer2_chain_modify(&trans, ip->chain, 0);
			ipdata->mode = cur_mode;
			ipdata->ctime = ctime;
			kflags |= NOTE_ATTRIB;
		}
	}
done:
	/*
	 * NOTE(review): kflags is accumulated but no hammer2_knote() call
	 * is made before returning -- confirm whether notification was
	 * intended here.
	 */
	hammer2_inode_unlock_ex(ip);
	hammer2_trans_done(&trans);
	return (error);
}
424
/*
 * hammer2_vop_readdir { vp, uio, cred, eofflag, ncookies, cookies }
 *
 * Copy directory entries into the caller's uio.  "." and ".." are
 * synthesized artificial entries at offsets 0 and 1; real entries are
 * scanned from the directory inode's chain using hashed keys.
 */
static
int
hammer2_vop_readdir(struct vop_readdir_args *ap)
{
	hammer2_inode_data_t *ipdata;
	hammer2_mount_t *hmp;
	hammer2_inode_t *ip;
	hammer2_inode_t *xip;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_tid_t inum;
	hammer2_key_t lkey;
	struct uio *uio;
	off_t *cookies;
	off_t saveoff;		/* resume offset, updated as entries emit */
	int cookie_index;
	int ncookies;
	int error;
	int dtype;
	int r;

	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	/*
	 * Setup cookies directory entry cookies if requested
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
	} else {
		ncookies = -1;
		cookies = NULL;
	}
	cookie_index = 0;

	hammer2_inode_lock_sh(ip);
	ipdata = &ip->chain->data->ipdata;

	/*
	 * Handle artificial entries.  To ensure that only positive 64 bit
	 * quantities are returned to userland we always strip off bit 63.
	 * The hash code is designed such that codes 0x0000-0x7FFF are not
	 * used, allowing us to use these codes for articial entries.
	 *
	 * Entry 0 is used for '.' and entry 1 is used for '..'.  Do not
	 * allow '..' to cross the mount point into (e.g.) the super-root.
	 */
	error = 0;
	chain = (void *)(intptr_t)-1;	/* non-NULL for early goto done case */

	if (saveoff == 0) {
		inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
		r = vop_write_dirent(&error, uio, inum, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	if (saveoff == 1) {
		/*
		 * Be careful with lockorder when accessing ".."
		 *
		 * (ip is the current dir. xip is the parent dir).
		 */
		inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
		while (ip->pip != NULL && ip != ip->pmp->iroot) {
			xip = ip->pip;
			hammer2_inode_ref(xip);
			/* drop ip's lock before taking the parent's */
			hammer2_inode_unlock_sh(ip);
			hammer2_inode_lock_sh(xip);
			hammer2_inode_lock_sh(ip);
			hammer2_inode_drop(xip);
			/* retry if the parent changed while unlocked */
			if (xip == ip->pip) {
				inum = xip->chain->data->ipdata.inum &
				       HAMMER2_DIRHASH_USERMSK;
				hammer2_inode_unlock_sh(xip);
				break;
			}
			hammer2_inode_unlock_sh(xip);
		}
		r = vop_write_dirent(&error, uio, inum, DT_DIR, 2, "..");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	lkey = saveoff | HAMMER2_DIRHASH_VISIBLE;

	/*
	 * parent is the inode chain, already locked for us.  Don't
	 * double lock shared locks as this will screw up upgrades.
	 */
	if (error) {
		goto done;
	}
	parent = hammer2_chain_lookup_init(ip->chain, HAMMER2_LOOKUP_SHARED);
	chain = hammer2_chain_lookup(&parent, lkey, lkey,
				     HAMMER2_LOOKUP_SHARED);
	if (chain == NULL) {
		chain = hammer2_chain_lookup(&parent,
					     lkey, (hammer2_key_t)-1,
					     HAMMER2_LOOKUP_SHARED);
	}
	while (chain) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			dtype = hammer2_get_dtype(chain);
			saveoff = chain->bref.key & HAMMER2_DIRHASH_USERMSK;
			r = vop_write_dirent(&error, uio,
					     chain->data->ipdata.inum &
					      HAMMER2_DIRHASH_USERMSK,
					     dtype,
					     chain->data->ipdata.name_len,
					     chain->data->ipdata.filename);
			if (r)
				break;
			if (cookies)
				cookies[cookie_index] = saveoff;
			++cookie_index;
		} else {
			/* XXX chain error */
			kprintf("bad chain type readdir %d\n",
				chain->bref.type);
		}

		/*
		 * Keys may not be returned in order so once we have a
		 * placemarker (chain) the scan must allow the full range
		 * or some entries will be missed.
		 */
		chain = hammer2_chain_next(&parent, chain,
					   HAMMER2_DIRHASH_VISIBLE,
					   (hammer2_key_t)-1,
					   HAMMER2_LOOKUP_SHARED);
		if (chain) {
			saveoff = (chain->bref.key &
				   HAMMER2_DIRHASH_USERMSK) + 1;
		} else {
			saveoff = (hammer2_key_t)-1;
		}
		if (cookie_index == ncookies)
			break;
	}
	if (chain)
		hammer2_chain_unlock(chain);
	hammer2_chain_lookup_done(parent);
done:
	hammer2_inode_unlock_sh(ip);
	/* chain == NULL only when the scan ran off the end: report EOF */
	if (ap->a_eofflag)
		*ap->a_eofflag = (chain == NULL);
	uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
	/* on error with nothing emitted, return no cookies at all */
	if (error && cookie_index == 0) {
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return (error);
}
604
605 /*
606  * hammer2_vop_readlink { vp, uio, cred }
607  */
608 static
609 int
610 hammer2_vop_readlink(struct vop_readlink_args *ap)
611 {
612         struct vnode *vp;
613         hammer2_mount_t *hmp;
614         hammer2_inode_t *ip;
615         int error;
616
617         vp = ap->a_vp;
618         if (vp->v_type != VLNK)
619                 return (EINVAL);
620         ip = VTOI(vp);
621         hmp = ip->hmp;
622
623         error = hammer2_read_file(ip, ap->a_uio, 0);
624         return (error);
625 }
626
627 static
628 int
629 hammer2_vop_read(struct vop_read_args *ap)
630 {
631         struct vnode *vp;
632         hammer2_mount_t *hmp;
633         hammer2_inode_t *ip;
634         struct uio *uio;
635         int error;
636         int seqcount;
637         int bigread;
638
639         /*
640          * Read operations supported on this vnode?
641          */
642         vp = ap->a_vp;
643         if (vp->v_type != VREG)
644                 return (EINVAL);
645
646         /*
647          * Misc
648          */
649         ip = VTOI(vp);
650         hmp = ip->hmp;
651         uio = ap->a_uio;
652         error = 0;
653
654         seqcount = ap->a_ioflag >> 16;
655         bigread = (uio->uio_resid > 100 * 1024 * 1024);
656
657         error = hammer2_read_file(ip, uio, seqcount);
658         return (error);
659 }
660
661 static
662 int
663 hammer2_vop_write(struct vop_write_args *ap)
664 {
665         hammer2_mount_t *hmp;
666         hammer2_inode_t *ip;
667         thread_t td;
668         struct vnode *vp;
669         struct uio *uio;
670         int error;
671         int seqcount;
672         int bigwrite;
673
674         /*
675          * Read operations supported on this vnode?
676          */
677         vp = ap->a_vp;
678         if (vp->v_type != VREG)
679                 return (EINVAL);
680
681         /*
682          * Misc
683          */
684         ip = VTOI(vp);
685         hmp = ip->hmp;
686         uio = ap->a_uio;
687         error = 0;
688         if (hmp->ronly)
689                 return (EROFS);
690
691         seqcount = ap->a_ioflag >> 16;
692         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
693
694         /*
695          * Check resource limit
696          */
697         if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
698             uio->uio_offset + uio->uio_resid >
699              td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
700                 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
701                 return (EFBIG);
702         }
703
704         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
705
706         /*
707          * ip must be locked if extending the file.
708          * ip must be locked to avoid racing a truncation.
709          *
710          * ip must be marked modified, particularly because the write
711          * might wind up being copied into the embedded data area.
712          */
713         hammer2_inode_lock_ex(ip);
714         error = hammer2_write_file(ip, uio, ap->a_ioflag, seqcount);
715         hammer2_inode_unlock_ex(ip);
716         return (error);
717 }
718
/*
 * Perform read operations on a file or symlink given an UNLOCKED
 * inode and uio.
 *
 * The passed ip is not locked; a shared inode lock is held for the
 * duration of the loop.
 */
static
int
hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
{
	hammer2_off_t size;
	struct buf *bp;
	int error;

	error = 0;

	/*
	 * UIO read loop.
	 */
	hammer2_inode_lock_sh(ip);
	size = ip->chain->data->ipdata.size;

	while (uio->uio_resid > 0 && uio->uio_offset < size) {
		hammer2_key_t lbase;
		hammer2_key_t leof;
		int lblksize;
		int loff;
		int n;

		/* translate the file offset into a logical block base/size */
		lblksize = hammer2_calc_logical(ip, uio->uio_offset,
						&lbase, &leof);

		/* clustered read with read-ahead scaled by seqcount */
		error = cluster_read(ip->vp, leof, lbase, lblksize,
				     uio->uio_resid, seqcount * BKVASIZE,
				     &bp);

		if (error)
			break;
		loff = (int)(uio->uio_offset - lbase);
		n = lblksize - loff;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		/* clamp the copy so we never read past EOF */
		if (n > size - uio->uio_offset)
			n = (int)(size - uio->uio_offset);
		bp->b_flags |= B_AGE;
		/*
		 * NOTE(review): uiomove() return value is ignored here, so
		 * a fault mid-copy is not reported to the caller -- confirm
		 * this is intentional.
		 */
		uiomove((char *)bp->b_data + loff, n, uio);
		bqrelse(bp);
	}
	hammer2_inode_unlock_sh(ip);
	return (error);
}
770
771 /*
772  * Called with a locked (ip) to do the underlying write to a file or
773  * to build the symlink target.
774  */
775 static
776 int
777 hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
778                    int ioflag, int seqcount)
779 {
780         hammer2_trans_t trans;
781         hammer2_inode_data_t *ipdata;
782         hammer2_key_t old_eof;
783         struct buf *bp;
784         int kflags;
785         int error;
786         int modified = 0;
787
788         /*
789          * Setup if append
790          */
791         ipdata = &ip->chain->data->ipdata;
792         if (ioflag & IO_APPEND)
793                 uio->uio_offset = ipdata->size;
794         kflags = 0;
795         error = 0;
796
797         hammer2_trans_init(&trans, ip->hmp);
798
799         /*
800          * Extend the file if necessary.  If the write fails at some point
801          * we will truncate it back down to cover as much as we were able
802          * to write.
803          *
804          * Doing this now makes it easier to calculate buffer sizes in
805          * the loop.
806          */
807         old_eof = ipdata->size;
808         if (uio->uio_offset + uio->uio_resid > ipdata->size) {
809                 modified = 1;
810                 hammer2_extend_file(&trans, ip,
811                                     uio->uio_offset + uio->uio_resid);
812                 kflags |= NOTE_EXTEND;
813         }
814
815         /*
816          * UIO write loop
817          */
818         while (uio->uio_resid > 0) {
819                 hammer2_key_t lbase;
820                 hammer2_key_t leof;
821                 int trivial;
822                 int lblksize;
823                 int loff;
824                 int n;
825
826                 /*
827                  * Don't allow the buffer build to blow out the buffer
828                  * cache.
829                  */
830                 if ((ioflag & IO_RECURSE) == 0) {
831                         /*
832                          * XXX should try to leave this unlocked through
833                          *      the whole loop
834                          */
835                         hammer2_inode_unlock_ex(ip);
836                         bwillwrite(HAMMER2_PBUFSIZE);
837                         hammer2_inode_lock_ex(ip);
838                         ipdata = &ip->chain->data->ipdata;      /* reload */
839                 }
840
841                 /* XXX bigwrite & signal check test */
842
843                 /*
844                  * This nominally tells us how much we can cluster and
845                  * what the logical buffer size needs to be.  Currently
846                  * we don't try to cluster the write and just handle one
847                  * block at a time.
848                  */
849                 lblksize = hammer2_calc_logical(ip, uio->uio_offset,
850                                                 &lbase, &leof);
851                 loff = (int)(uio->uio_offset - lbase);
852
853                 /*
854                  * Calculate bytes to copy this transfer and whether the
855                  * copy completely covers the buffer or not.
856                  */
857                 trivial = 0;
858                 n = lblksize - loff;
859                 if (n > uio->uio_resid) {
860                         n = uio->uio_resid;
861                         if (uio->uio_offset + n == ipdata->size)
862                                 trivial = 1;
863                 } else if (loff == 0) {
864                         trivial = 1;
865                 }
866
867                 /*
868                  * Get the buffer
869                  */
870                 if (uio->uio_segflg == UIO_NOCOPY) {
871                         /*
872                          * Issuing a write with the same data backing the
873                          * buffer.  Instantiate the buffer to collect the
874                          * backing vm pages, then read-in any missing bits.
875                          *
876                          * This case is used by vop_stdputpages().
877                          */
878                         bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
879                         if ((bp->b_flags & B_CACHE) == 0) {
880                                 bqrelse(bp);
881                                 error = bread(ip->vp, lbase, lblksize, &bp);
882                         }
883                 } else if (trivial) {
884                         /*
885                          * Even though we are entirely overwriting the buffer
886                          * we may still have to zero it out to avoid a
887                          * mmap/write visibility issue.
888                          */
889                         bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
890                         if ((bp->b_flags & B_CACHE) == 0)
891                                 vfs_bio_clrbuf(bp);
892                 } else {
893                         /*
894                          * Partial overwrite, read in any missing bits then
895                          * replace the portion being written.
896                          *
897                          * (The strategy code will detect zero-fill physical
898                          * blocks for this case).
899                          */
900                         error = bread(ip->vp, lbase, lblksize, &bp);
901                         if (error == 0)
902                                 bheavy(bp);
903                 }
904
905                 if (error) {
906                         brelse(bp);
907                         break;
908                 }
909
910                 /*
911                  * We have to assign physical storage to the buffer we intend
912                  * to dirty or write now to avoid deadlocks in the strategy
913                  * code later.
914                  *
915                  * This can return NOOFFSET for inode-embedded data.  The
916                  * strategy code will take care of it in that case.
917                  */
918                 bp->b_bio2.bio_offset =
919                         hammer2_assign_physical(&trans, ip,
920                                                 lbase, lblksize, &error);
921                 if (error) {
922                         brelse(bp);
923                         break;
924                 }
925
926                 /*
927                  * Ok, copy the data in
928                  */
929                 hammer2_inode_unlock_ex(ip);
930                 error = uiomove(bp->b_data + loff, n, uio);
931                 hammer2_inode_lock_ex(ip);
932                 ipdata = &ip->chain->data->ipdata;      /* reload */
933                 kflags |= NOTE_WRITE;
934                 modified = 1;
935
936                 if (error) {
937                         brelse(bp);
938                         break;
939                 }
940
941                 /* XXX update ip_data.mtime */
942
943                 /*
944                  * Once we dirty a buffer any cached offset becomes invalid.
945                  *
946                  * NOTE: For cluster_write() always use the trailing block
947                  *       size, which is HAMMER2_PBUFSIZE.  lblksize is the
948                  *       eof-straddling blocksize and is incorrect.
949                  */
950                 bp->b_flags |= B_AGE;
951                 if (ioflag & IO_SYNC) {
952                         bwrite(bp);
953                 } else if ((ioflag & IO_DIRECT) && loff + n == lblksize) {
954                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
955                                 bp->b_flags |= B_CLUSTEROK;
956                         bdwrite(bp);
957                 } else if (ioflag & IO_ASYNC) {
958                         bawrite(bp);
959                 } else if (hammer2_cluster_enable) {
960                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
961                                 bp->b_flags |= B_CLUSTEROK;
962                         cluster_write(bp, leof, HAMMER2_PBUFSIZE, seqcount);
963                 } else {
964                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
965                                 bp->b_flags |= B_CLUSTEROK;
966                         bdwrite(bp);
967                 }
968         }
969
970         /*
971          * Cleanup.  If we extended the file EOF but failed to write through
972          * the entire write is a failure and we have to back-up.
973          */
974         if (error && ipdata->size != old_eof) {
975                 hammer2_truncate_file(&trans, ip, old_eof);
976         } else if (modified) {
977                 hammer2_chain_modify(&trans, ip->chain, 0);
978                 hammer2_update_time(&ipdata->mtime);
979         }
980         hammer2_knote(ip->vp, kflags);
981         hammer2_trans_done(&trans);
982         return error;
983 }
984
985 /*
986  * Assign physical storage to a logical block.
987  *
988  * NOOFFSET is returned if the data is inode-embedded.  In this case the
989  * strategy code will simply bcopy() the data into the inode.
990  *
991  * The inode's delta_dcount is adjusted.
992  */
993 static
994 hammer2_off_t
995 hammer2_assign_physical(hammer2_trans_t *trans, hammer2_inode_t *ip,
996                         hammer2_key_t lbase, int lblksize, int *errorp)
997 {
998         hammer2_mount_t *hmp;
999         hammer2_chain_t *parent;
1000         hammer2_chain_t *chain;
1001         hammer2_off_t pbase;
1002
1003         /*
1004          * Locate the chain associated with lbase, return a locked chain.
1005          * However, do not instantiate any data reference (which utilizes a
1006          * device buffer) because we will be using direct IO via the
1007          * logical buffer cache buffer.
1008          */
1009         hmp = ip->hmp;
1010         *errorp = 0;
1011 retry:
1012         hammer2_inode_lock_ex(ip);
1013         parent = hammer2_chain_lookup_init(ip->chain, 0);
1014         chain = hammer2_chain_lookup(&parent,
1015                                      lbase, lbase,
1016                                      HAMMER2_LOOKUP_NODATA);
1017
1018         if (chain == NULL) {
1019                 /*
1020                  * We found a hole, create a new chain entry.
1021                  *
1022                  * NOTE: DATA chains are created without device backing
1023                  *       store (nor do we want any).
1024                  */
1025                 *errorp = hammer2_chain_create(trans, parent, &chain,
1026                                                lbase, HAMMER2_PBUFRADIX,
1027                                                HAMMER2_BREF_TYPE_DATA,
1028                                                lblksize);
1029                 if (chain == NULL) {
1030                         hammer2_inode_unlock_ex(ip);
1031                         hammer2_chain_lookup_done(parent);
1032                         panic("hammer2_chain_create: par=%p error=%d\n",
1033                                 parent, *errorp);
1034                         goto retry;
1035                 }
1036
1037                 pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
1038                 /*ip->delta_dcount += lblksize;*/
1039         } else {
1040                 switch (chain->bref.type) {
1041                 case HAMMER2_BREF_TYPE_INODE:
1042                         /*
1043                          * The data is embedded in the inode.  The
1044                          * caller is responsible for marking the inode
1045                          * modified and copying the data to the embedded
1046                          * area.
1047                          */
1048                         pbase = NOOFFSET;
1049                         break;
1050                 case HAMMER2_BREF_TYPE_DATA:
1051                         if (chain->bytes != lblksize) {
1052                                 panic("hammer2_assign_physical: "
1053                                       "size mismatch %d/%d\n",
1054                                       lblksize, chain->bytes);
1055                         }
1056                         hammer2_chain_modify(trans, chain,
1057                                              HAMMER2_MODIFY_OPTDATA);
1058                         pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
1059                         break;
1060                 default:
1061                         panic("hammer2_assign_physical: bad type");
1062                         /* NOT REACHED */
1063                         pbase = NOOFFSET;
1064                         break;
1065                 }
1066         }
1067         if (chain)
1068                 hammer2_chain_unlock(chain);
1069         hammer2_chain_lookup_done(parent);
1070
1071         hammer2_inode_unlock_ex(ip);
1072
1073         return (pbase);
1074 }
1075
1076 /*
1077  * Truncate the size of a file.
1078  *
1079  * This routine adjusts ipdata->size smaller, destroying any related
1080  * data beyond the new EOF and potentially resizing the block straddling
1081  * the EOF.
1082  *
1083  * The inode must be locked.
1084  */
static
void
hammer2_truncate_file(hammer2_trans_t *trans,
		      hammer2_inode_t *ip, hammer2_key_t nsize)
{
	hammer2_inode_data_t *ipdata;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t lbase;
	hammer2_key_t leof;
	struct buf *bp;
	int loff;
	int error;
	int oblksize;
	int nblksize;

	/*
	 * Mark the inode chain modified up front; ipdata points into the
	 * chain's data area and is used to adjust the file size below.
	 */
	hammer2_chain_modify(trans, ip->chain, 0);
	bp = NULL;
	ipdata = &ip->chain->data->ipdata;
	error = 0;

	/*
	 * Destroy any logical buffer cache buffers beyond the file EOF.
	 *
	 * We call nvtruncbuf() w/ trivial == 1 to prevent it from messing
	 * around with the buffer straddling EOF, because we need to assign
	 * a new physical offset to it.
	 */
	if (ip->vp) {
		nvtruncbuf(ip->vp, nsize,
			   HAMMER2_PBUFSIZE, (int)nsize & HAMMER2_PBUFMASK,
			   1);
	}

	/*
	 * Setup for lookup/search
	 */
	parent = hammer2_chain_lookup_init(ip->chain, 0);

	/*
	 * Handle the case where a chain/logical-buffer straddles the new
	 * EOF.  We told nvtruncbuf() above not to mess with the logical
	 * buffer straddling the EOF because we need to reassign its storage
	 * and can't let the strategy code do it for us.
	 */
	loff = (int)nsize & HAMMER2_PBUFMASK;
	if (loff && ip->vp) {
		/*
		 * Read the straddling block using the OLD blocksize,
		 * computed before ipdata->size is updated below.
		 */
		oblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
		error = bread(ip->vp, lbase, oblksize, &bp);
		KKASSERT(error == 0);
	}
	ipdata->size = nsize;
	nblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);

	/*
	 * Fixup the chain element.  If we have a logical buffer in-hand
	 * we don't want to create a conflicting device buffer.
	 */
	if (loff && bp) {
		chain = hammer2_chain_lookup(&parent, lbase, lbase,
					     HAMMER2_LOOKUP_NODATA);
		if (chain) {
			switch(chain->bref.type) {
			case HAMMER2_BREF_TYPE_DATA:
				/*
				 * Resize the backing store to the new
				 * (smaller) blocksize, shrink the logical
				 * buffer to match, and zero the tail beyond
				 * the new EOF within the block.
				 */
				hammer2_chain_resize(trans, ip, bp,
					     parent, &chain,
					     hammer2_allocsize(nblksize),
					     HAMMER2_MODIFY_OPTDATA);
				allocbuf(bp, nblksize);
				bzero(bp->b_data + loff, nblksize - loff);
				bp->b_bio2.bio_offset = chain->bref.data_off &
							HAMMER2_OFF_MASK;
				break;
			case HAMMER2_BREF_TYPE_INODE:
				/*
				 * Inode-embedded data: no device backing
				 * store to resize; NOOFFSET tells the
				 * strategy code to handle it specially.
				 */
				allocbuf(bp, nblksize);
				bzero(bp->b_data + loff, nblksize - loff);
				bp->b_bio2.bio_offset = NOOFFSET;
				break;
			default:
				panic("hammer2_truncate_file: bad type");
				break;
			}
			hammer2_chain_unlock(chain);
			if (bp->b_bcount == HAMMER2_PBUFSIZE)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		} else {
			/*
			 * Destroy clean buffer w/ wrong buffer size.  Retain
			 * backing store.
			 */
			bp->b_flags |= B_RELBUF;
			KKASSERT(bp->b_bio2.bio_offset == NOOFFSET);
			KKASSERT((bp->b_flags & B_DIRTY) == 0);
			bqrelse(bp);
		}
	} else if (loff) {
		/*
		 * WARNING: This utilizes a device buffer for the data.
		 *
		 * This case should not occur because file truncations without
		 * a vnode (and hence no logical buffer cache) should only
		 * always truncate to 0-length.
		 */
		panic("hammer2_truncate_file: non-zero truncation, no-vnode");
#if 0
		chain = hammer2_chain_lookup(&parent, lbase, lbase, 0);
		if (chain) {
			switch(chain->bref.type) {
			case HAMMER2_BREF_TYPE_DATA:
				chain = hammer2_chain_resize(trans, ip, bp,
					     parent, chain,
					     hammer2_allocsize(nblksize),
					     0);
				hammer2_chain_modify(hmp, chain, 0);
				bzero(chain->data->buf + loff, nblksize - loff);
				break;
			case HAMMER2_BREF_TYPE_INODE:
				if (loff < HAMMER2_EMBEDDED_BYTES) {
					hammer2_chain_modify(hmp, chain, 0);
					bzero(chain->data->ipdata.u.data + loff,
					      HAMMER2_EMBEDDED_BYTES - loff);
				}
				break;
			}
			hammer2_chain_unlock(chain);
		}
#endif
	}

	/*
	 * Clean up any fragmentory VM pages now that we have properly
	 * resized the straddling buffer.  These pages are no longer
	 * part of the buffer.
	 */
	if (ip->vp) {
		nvtruncbuf(ip->vp, nsize,
			   nblksize, (int)nsize & (nblksize - 1),
			   1);
	}

	/*
	 * Destroy any physical blocks after the new EOF point.
	 *
	 * lbase is rounded up to the next HAMMER2_PBUFSIZE boundary so the
	 * straddling block handled above is not deleted.
	 */
	lbase = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
	chain = hammer2_chain_lookup(&parent,
				     lbase, (hammer2_key_t)-1,
				     HAMMER2_LOOKUP_NODATA);
	while (chain) {
		/*
		 * Degenerate embedded data case, nothing to loop on.
		 */
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			hammer2_chain_unlock(chain);
			break;
		}

		/*
		 * Delete physical data blocks past the file EOF.
		 */
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			/*ip->delta_dcount -= chain->bytes;*/
			hammer2_chain_delete(trans, parent, chain);
		}
		/* XXX check parent if empty indirect block & delete */
		chain = hammer2_chain_next(&parent, chain,
					   lbase, (hammer2_key_t)-1,
					   HAMMER2_LOOKUP_NODATA);
	}
	hammer2_chain_lookup_done(parent);
}
1256
1257 /*
1258  * Extend the size of a file.  The inode must be locked.
1259  *
1260  * We may have to resize the block straddling the old EOF.
1261  */
static
void
hammer2_extend_file(hammer2_trans_t *trans,
		    hammer2_inode_t *ip, hammer2_key_t nsize)
{
	hammer2_inode_data_t *ipdata;
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	struct buf *bp;
	hammer2_key_t osize;
	hammer2_key_t obase;
	hammer2_key_t nbase;
	hammer2_key_t leof;
	int oblksize;
	int nblksize;
	int nradix;
	int error;

	KKASSERT(ip->vp);
	hmp = ip->hmp;		/* NOTE(review): assigned but never read below */

	hammer2_chain_modify(trans, ip->chain, 0);
	ipdata = &ip->chain->data->ipdata;

	/*
	 * Nothing to do if the direct-data case is still intact
	 */
	if ((ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
	    nsize <= HAMMER2_EMBEDDED_BYTES) {
		ipdata->size = nsize;
		/*
		 * NOTE(review): ipdata->size was just set to nsize, so the
		 * old-length argument passed to nvextendbuf() equals the new
		 * length.  Verify that the pre-assignment size was not the
		 * intended value here.
		 */
		nvextendbuf(ip->vp,
			    ipdata->size, nsize,
			    0, HAMMER2_EMBEDDED_BYTES,
			    0, (int)nsize,
			    1);
		return;
	}

	/*
	 * Calculate the blocksize at the original EOF and resize the block
	 * if necessary.  Adjust the file size in the inode.
	 */
	osize = ipdata->size;
	oblksize = hammer2_calc_logical(ip, osize, &obase, &leof);
	ipdata->size = nsize;
	/*
	 * Recompute the geometry at the OLD EOF offset after updating
	 * ipdata->size -- presumably hammer2_calc_logical() consults the
	 * inode size, so this yields the straddling block's post-extension
	 * blocksize.  TODO(review): confirm osize (not nsize) is intended.
	 */
	nblksize = hammer2_calc_logical(ip, osize, &nbase, &leof);

	/*
	 * Do all required vnode operations, but do not mess with the
	 * buffer straddling the original EOF.
	 *
	 * NOTE(review): as in the direct-data case above, ipdata->size
	 * already equals nsize at this point; the saved osize may have
	 * been intended as the old-length argument.
	 */
	nvextendbuf(ip->vp,
		    ipdata->size, nsize,
		    0, nblksize,
		    0, (int)nsize & HAMMER2_PBUFMASK,
		    1);

	/*
	 * Early return if we have no more work to do.
	 */
	if (obase == nbase && oblksize == nblksize &&
	    (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) == 0) {
		return;
	}

	/*
	 * We have work to do, including possibly resizing the buffer
	 * at the previous EOF point and turning off DIRECTDATA mode.
	 */
	bp = NULL;
	if (((int)osize & HAMMER2_PBUFMASK)) {
		error = bread(ip->vp, obase, oblksize, &bp);
		KKASSERT(error == 0);
	}

	/*
	 * Disable direct-data mode by loading up a buffer cache buffer
	 * with the data, then converting the inode data area into the
	 * inode indirect block array area.
	 */
	if (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
		ipdata->op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
		bzero(&ipdata->u.blockset, sizeof(ipdata->u.blockset));
	}

	/*
	 * Resize the chain element at the old EOF.  If no chain exists at
	 * that offset yet (hole, or data was inode-embedded), create one.
	 */
	if (((int)osize & HAMMER2_PBUFMASK)) {
retry:
		error = 0;
		parent = hammer2_chain_lookup_init(ip->chain, 0);
		nradix = hammer2_allocsize(nblksize);

		chain = hammer2_chain_lookup(&parent,
					     obase, obase,
					     HAMMER2_LOOKUP_NODATA);
		if (chain == NULL) {
			error = hammer2_chain_create(trans, parent, &chain,
						     obase, nblksize,
						     HAMMER2_BREF_TYPE_DATA,
						     nblksize);
			if (chain == NULL) {
				/*
				 * NOTE(review): panic() does not return;
				 * the goto retry below is effectively dead.
				 */
				hammer2_chain_lookup_done(parent);
				panic("hammer2_chain_create: par=%p error=%d\n",
					parent, error);
				goto retry;
			}
			/*ip->delta_dcount += nblksize;*/
		} else {
			KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
			hammer2_chain_resize(trans, ip, bp,
					     parent, &chain,
					     nradix,
					     HAMMER2_MODIFY_OPTDATA);
		}
		/*
		 * Grow the logical buffer to the straddling block's new
		 * size and point its second-level bio at the (possibly
		 * relocated) physical storage.
		 */
		if (obase != nbase) {
			if (oblksize != HAMMER2_PBUFSIZE)
				allocbuf(bp, HAMMER2_PBUFSIZE);
		} else {
			if (oblksize != nblksize)
				allocbuf(bp, nblksize);
		}
		bp->b_bio2.bio_offset = chain->bref.data_off &
					HAMMER2_OFF_MASK;
		hammer2_chain_unlock(chain);
		if (bp->b_bcount == HAMMER2_PBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;
		bdwrite(bp);
		hammer2_chain_lookup_done(parent);  /* must be after bdwrite */
	}
}
1395
/*
 * Resolve a name in directory (ap->a_dvp) to a vnode, handling hardlink
 * forwarding entries and resolving the namecache entry on success.
 */
static
int
hammer2_vop_nresolve(struct vop_nresolve_args *ap)
{
	hammer2_inode_t *ip;
	hammer2_inode_t *dip;
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_chain_t *ochain;
	hammer2_trans_t trans;
	struct namecache *ncp;
	const uint8_t *name;
	size_t name_len;
	hammer2_key_t lhc;
	int error = 0;
	struct vnode *vp;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;
	ncp = ap->a_nch->ncp;
	name = ncp->nc_name;
	name_len = ncp->nc_nlen;
	lhc = hammer2_dirhash(name, name_len);

	/*
	 * Note: In DragonFly the kernel handles '.' and '..'.
	 *
	 * Scan the directory hash range [lhc, lhc + LOMASK]; multiple
	 * entries can share the same hash so each candidate's name is
	 * compared exactly.  The winning chain (if any) is left locked
	 * when we break out of the loop.
	 */
	hammer2_inode_lock_sh(dip);
	parent = hammer2_chain_lookup_init(dip->chain, HAMMER2_LOOKUP_SHARED);
	chain = hammer2_chain_lookup(&parent,
				     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
				     HAMMER2_LOOKUP_SHARED);
	while (chain) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
		    name_len == chain->data->ipdata.name_len &&
		    bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
			break;
		}
		chain = hammer2_chain_next(&parent, chain,
					   lhc, lhc + HAMMER2_DIRHASH_LOMASK,
					   HAMMER2_LOOKUP_SHARED);
	}
	hammer2_chain_lookup_done(parent);
	hammer2_inode_unlock_sh(dip);

	/*
	 * If the inode represents a forwarding entry for a hardlink we have
	 * to locate the actual inode.  The original ip is saved for possible
	 * deconsolidation.  (ip) will only be set to non-NULL when we have
	 * to locate the real file via a hardlink.  ip will be referenced but
	 * not locked in that situation.  chain is passed in locked and
	 * returned locked.
	 *
	 * XXX what kind of chain lock?
	 */
	ochain = NULL;
	if (chain && chain->data->ipdata.type == HAMMER2_OBJTYPE_HARDLINK) {
		error = hammer2_hardlink_find(dip, &chain, &ochain);
		if (error) {
			kprintf("hammer2: unable to find hardlink\n");
			if (chain) {
				hammer2_chain_unlock(chain);
				chain = NULL;
			}
			goto failed;
		}
	}

	/*
	 * Deconsolidate any hardlink whos nlinks == 1.  Ignore errors.
	 * If an error occurs chain and ip are left alone.
	 *
	 * XXX upgrade shared lock?
	 */
	if (ochain && chain && chain->data->ipdata.nlinks == 1 && !hmp->ronly) {
		kprintf("hammer2: need to unconsolidate hardlink for %s\n",
			chain->data->ipdata.filename);
		/* XXX retain shared lock on dip? (currently not held) */
		hammer2_trans_init(&trans, dip->hmp);
		hammer2_hardlink_deconsolidate(&trans, dip, &chain, &ochain);
		hammer2_trans_done(&trans);
	}

	/*
	 * Acquire the related vnode
	 *
	 * NOTE: For error processing, only ENOENT resolves the namecache
	 *       entry to NULL, otherwise we just return the error and
	 *       leave the namecache unresolved.
	 *
	 * NOTE: multiple hammer2_inode structures can be aliased to the
	 *       same chain element, for example for hardlinks.  This
	 *       use case does not 'reattach' inode associations that
	 *       might already exist, but always allocates a new one.
	 *
	 * WARNING: inode structure is locked exclusively via inode_get
	 *          but chain was locked shared.  inode_unlock_ex()
	 *          will handle it properly.
	 */
	if (chain) {
		ip = hammer2_inode_get(hmp, dip->pmp, dip, chain);
		vp = hammer2_igetv(ip, &error);
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
		} else if (error == ENOENT) {
			cache_setvp(ap->a_nch, NULL);
		}
		hammer2_inode_unlock_ex(ip);

		/*
		 * The vp should not be released until after we've disposed
		 * of our locks, because it might cause vop_inactive() to
		 * be called.
		 *
		 * NOTE(review): the guard suggests hammer2_igetv() can
		 * return NULL on error -- verify against its definition.
		 */
		if (vp)
			vrele(vp);
	} else {
		error = ENOENT;
		cache_setvp(ap->a_nch, NULL);
	}
failed:
	KASSERT(error || ap->a_nch->ncp->nc_vp != NULL,
		("resolve error %d/%p chain %p ap %p\n",
		 error, ap->a_nch->ncp->nc_vp, chain, ap));
	if (ochain)
		hammer2_chain_drop(ochain);
	return error;
}
1526
1527 static
1528 int
1529 hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1530 {
1531         hammer2_inode_t *dip;
1532         hammer2_inode_t *ip;
1533         hammer2_mount_t *hmp;
1534         int error;
1535
1536         dip = VTOI(ap->a_dvp);
1537         hmp = dip->hmp;
1538
1539         if ((ip = dip->pip) == NULL) {
1540                 *ap->a_vpp = NULL;
1541                 return ENOENT;
1542         }
1543         hammer2_inode_lock_ex(ip);
1544         *ap->a_vpp = hammer2_igetv(ip, &error);
1545         hammer2_inode_unlock_ex(ip);
1546
1547         return error;
1548 }
1549
1550 static
1551 int
1552 hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
1553 {
1554         hammer2_mount_t *hmp;
1555         hammer2_inode_t *dip;
1556         hammer2_inode_t *nip;
1557         hammer2_trans_t trans;
1558         struct namecache *ncp;
1559         const uint8_t *name;
1560         size_t name_len;
1561         int error;
1562
1563         dip = VTOI(ap->a_dvp);
1564         hmp = dip->hmp;
1565         if (hmp->ronly)
1566                 return (EROFS);
1567
1568         ncp = ap->a_nch->ncp;
1569         name = ncp->nc_name;
1570         name_len = ncp->nc_nlen;
1571
1572         hammer2_trans_init(&trans, hmp);
1573         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1574                                    name, name_len, &error);
1575         if (error) {
1576                 KKASSERT(nip == NULL);
1577                 *ap->a_vpp = NULL;
1578         } else {
1579                 *ap->a_vpp = hammer2_igetv(nip, &error);
1580                 hammer2_inode_unlock_ex(nip);
1581         }
1582         hammer2_trans_done(&trans);
1583
1584         if (error == 0) {
1585                 cache_setunresolved(ap->a_nch);
1586                 cache_setvp(ap->a_nch, *ap->a_vpp);
1587         }
1588         return error;
1589 }
1590
1591 /*
1592  * Return the largest contiguous physical disk range for the logical
1593  * request.
1594  *
1595  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
1596  */
1597 static
1598 int
1599 hammer2_vop_bmap(struct vop_bmap_args *ap)
1600 {
1601         struct vnode *vp;
1602         hammer2_mount_t *hmp;
1603         hammer2_inode_t *ip;
1604         hammer2_chain_t *parent;
1605         hammer2_chain_t *chain;
1606         hammer2_key_t lbeg;
1607         hammer2_key_t lend;
1608         hammer2_off_t pbeg;
1609         hammer2_off_t pbytes;
1610         hammer2_off_t array[HAMMER2_BMAP_COUNT][2];
1611         int loff;
1612         int ai;
1613
1614         /*
1615          * Only supported on regular files
1616          *
1617          * Only supported for read operations (required for cluster_read).
1618          * The block allocation is delayed for write operations.
1619          */
1620         vp = ap->a_vp;
1621         if (vp->v_type != VREG)
1622                 return (EOPNOTSUPP);
1623         if (ap->a_cmd != BUF_CMD_READ)
1624                 return (EOPNOTSUPP);
1625
1626         ip = VTOI(vp);
1627         hmp = ip->hmp;
1628         bzero(array, sizeof(array));
1629
1630         /*
1631          * Calculate logical range
1632          */
1633         KKASSERT((ap->a_loffset & HAMMER2_LBUFMASK64) == 0);
1634         lbeg = ap->a_loffset & HAMMER2_OFF_MASK_HI;
1635         lend = lbeg + HAMMER2_BMAP_COUNT * HAMMER2_PBUFSIZE - 1;
1636         if (lend < lbeg)
1637                 lend = lbeg;
1638         loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;
1639
1640         hammer2_inode_lock_sh(ip);
1641         parent = hammer2_chain_lookup_init(ip->chain, HAMMER2_LOOKUP_SHARED);
1642         chain = hammer2_chain_lookup(&parent,
1643                                      lbeg, lend,
1644                                      HAMMER2_LOOKUP_NODATA |
1645                                      HAMMER2_LOOKUP_SHARED);
1646         if (chain == NULL) {
1647                 *ap->a_doffsetp = ZFOFFSET;
1648                 hammer2_chain_lookup_done(parent);
1649                 hammer2_inode_unlock_sh(ip);
1650                 return (0);
1651         }
1652
1653         while (chain) {
1654                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1655                         ai = (chain->bref.key - lbeg) / HAMMER2_PBUFSIZE;
1656                         KKASSERT(ai >= 0 && ai < HAMMER2_BMAP_COUNT);
1657                         array[ai][0] = chain->bref.data_off & HAMMER2_OFF_MASK;
1658                         array[ai][1] = chain->bytes;
1659                 }
1660                 chain = hammer2_chain_next(&parent, chain,
1661                                            lbeg, lend,
1662                                            HAMMER2_LOOKUP_NODATA |
1663                                            HAMMER2_LOOKUP_SHARED);
1664         }
1665         hammer2_chain_lookup_done(parent);
1666         hammer2_inode_unlock_sh(ip);
1667
1668         /*
1669          * If the requested loffset is not mappable physically we can't
1670          * bmap.  The caller will have to access the file data via a
1671          * device buffer.
1672          */
1673         if (array[0][0] == 0 || array[0][1] < loff + HAMMER2_LBUFSIZE) {
1674                 *ap->a_doffsetp = NOOFFSET;
1675                 return (0);
1676         }
1677
1678         /*
1679          * Calculate the physical disk offset range for array[0]
1680          */
1681         pbeg = array[0][0] + loff;
1682         pbytes = array[0][1] - loff;
1683
1684         for (ai = 1; ai < HAMMER2_BMAP_COUNT; ++ai) {
1685                 if (array[ai][0] != pbeg + pbytes)
1686                         break;
1687                 pbytes += array[ai][1];
1688         }
1689
1690         *ap->a_doffsetp = pbeg;
1691         if (ap->a_runp)
1692                 *ap->a_runp = pbytes;
1693         return (0);
1694 }
1695
/*
 * hammer2_vop_open: no filesystem-specific open-time work is needed;
 * delegate entirely to the stock vop_stdopen() handler.
 */
static
int
hammer2_vop_open(struct vop_open_args *ap)
{
	return vop_stdopen(ap);
}
1702
/*
 * hammer2_vop_advlock { vp, id, op, fl, flags }
 *
 * Advisory (fcntl/flock style) record locking.  The current file size
 * is sampled from the inode's media data under the shared inode lock
 * and handed to the generic lf_advlock() lock manager along with the
 * per-inode lock list.
 */
static
int
hammer2_vop_advlock(struct vop_advlock_args *ap)
{
	hammer2_inode_t *ip = VTOI(ap->a_vp);
	hammer2_off_t size;

	/* shared lock is sufficient: we only read the size field */
	hammer2_inode_lock_sh(ip);
	size = ip->chain->data->ipdata.size;
	hammer2_inode_unlock_sh(ip);
	return (lf_advlock(ap, &ip->advlock, size));
}
1718
1719
/*
 * hammer2_vop_close: no filesystem-specific close-time work is needed;
 * delegate entirely to the stock vop_stdclose() handler.
 */
static
int
hammer2_vop_close(struct vop_close_args *ap)
{
	return vop_stdclose(ap);
}
1726
/*
 * hammer2_vop_nlink { nch, dvp, vp, cred }
 *
 * Create a hardlink from (vp) to {dvp, nch}.
 */
static
int
hammer2_vop_nlink(struct vop_nlink_args *ap)
{
	hammer2_inode_t *dip;	/* target directory to create link in */
	hammer2_inode_t *ip;	/* inode we are hardlinking to */
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	hammer2_trans_t trans;
	struct namecache *ncp;
	const uint8_t *name;
	size_t name_len;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;
	if (hmp->ronly)
		return (EROFS);

	ncp = ap->a_nch->ncp;
	name = ncp->nc_name;
	name_len = ncp->nc_nlen;
	hammer2_trans_init(&trans, hmp);

	/*
	 * ip represents the file being hardlinked.  The file could be a
	 * normal file or a hardlink target if it has already been hardlinked.
	 * If ip is a hardlinked target then ip->pip represents the location
	 * of the hardlinked target, NOT the location of the hardlink pointer.
	 *
	 * Bump nlinks and potentially also create or move the hardlink
	 * target in the parent directory common to (ip) and (dip).  The
	 * consolidation code can modify ip->chain and ip->pip.  The
	 * returned chain is locked.
	 */
	ip = VTOI(ap->a_vp);
	hammer2_inode_ref(ip);		/* hold ip across the operation */
	error = hammer2_hardlink_consolidate(&trans, ip, &chain, dip, 1);
	if (error)
		goto done;

	/*
	 * Create a directory entry connected to the specified chain.
	 * This function unlocks and NULL's chain on return.
	 */
	error = hammer2_inode_connect(&trans, dip, ip, &chain, name, name_len);
	if (chain) {
		/* defensive: drop the lock if connect left chain intact */
		hammer2_chain_unlock(chain);
		chain = NULL;
	}
	if (error == 0) {
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, ap->a_vp);
	}
done:
	hammer2_inode_drop(ip);
	hammer2_trans_done(&trans);

	return error;
}
1792
1793 /*
1794  * hammer2_vop_ncreate { nch, dvp, vpp, cred, vap }
1795  *
1796  * The operating system has already ensured that the directory entry
1797  * does not exist and done all appropriate namespace locking.
1798  */
1799 static
1800 int
1801 hammer2_vop_ncreate(struct vop_ncreate_args *ap)
1802 {
1803         hammer2_mount_t *hmp;
1804         hammer2_inode_t *dip;
1805         hammer2_inode_t *nip;
1806         hammer2_trans_t trans;
1807         struct namecache *ncp;
1808         const uint8_t *name;
1809         size_t name_len;
1810         int error;
1811
1812         dip = VTOI(ap->a_dvp);
1813         hmp = dip->hmp;
1814         if (hmp->ronly)
1815                 return (EROFS);
1816
1817         ncp = ap->a_nch->ncp;
1818         name = ncp->nc_name;
1819         name_len = ncp->nc_nlen;
1820         hammer2_trans_init(&trans, hmp);
1821
1822         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1823                                    name, name_len, &error);
1824         if (error) {
1825                 KKASSERT(nip == NULL);
1826                 *ap->a_vpp = NULL;
1827         } else {
1828                 *ap->a_vpp = hammer2_igetv(nip, &error);
1829                 hammer2_inode_unlock_ex(nip);
1830         }
1831         hammer2_trans_done(&trans);
1832
1833         if (error == 0) {
1834                 cache_setunresolved(ap->a_nch);
1835                 cache_setvp(ap->a_nch, *ap->a_vpp);
1836         }
1837         return error;
1838 }
1839
/*
 * hammer2_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 *
 * Create a symlink inode.  Short targets are stored in the inode's
 * embedded data area; longer ones are written out as file data.
 */
static
int
hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	hammer2_mount_t *hmp;
	hammer2_inode_t *dip;
	hammer2_inode_t *nip;
	hammer2_trans_t trans;
	struct namecache *ncp;
	const uint8_t *name;
	size_t name_len;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;
	if (hmp->ronly)
		return (EROFS);

	ncp = ap->a_nch->ncp;
	name = ncp->nc_name;
	name_len = ncp->nc_nlen;
	hammer2_trans_init(&trans, hmp);

	ap->a_vap->va_type = VLNK;	/* enforce type */

	nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
				   name, name_len, &error);
	if (error) {
		KKASSERT(nip == NULL);
		*ap->a_vpp = NULL;
		hammer2_trans_done(&trans);
		return error;
	}
	*ap->a_vpp = hammer2_igetv(nip, &error);

	/*
	 * Build the softlink (~like file data) and finalize the namecache.
	 */
	if (error == 0) {
		size_t bytes;
		struct uio auio;
		struct iovec aiov;
		hammer2_inode_data_t *nipdata;

		nipdata = &nip->chain->data->ipdata;
		bytes = strlen(ap->a_target);

		if (bytes <= HAMMER2_EMBEDDED_BYTES) {
			/*
			 * Short target: copy directly into the inode's
			 * embedded data area and record the size.
			 */
			KKASSERT(nipdata->op_flags &
				 HAMMER2_OPFLAG_DIRECTDATA);
			bcopy(ap->a_target, nipdata->u.data, bytes);
			nipdata->size = bytes;
		} else {
			/*
			 * Long target: write it as regular file data via
			 * a kernel-space uio.
			 */
			bzero(&auio, sizeof(auio));
			bzero(&aiov, sizeof(aiov));
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_WRITE;
			auio.uio_resid = bytes;
			auio.uio_iovcnt = 1;
			auio.uio_td = curthread;
			aiov.iov_base = ap->a_target;
			aiov.iov_len = bytes;
			error = hammer2_write_file(nip, &auio, IO_APPEND, 0);
			/* XXX handle error */
			error = 0;	/* deliberately swallowed (see XXX) */
		}
	}
	hammer2_inode_unlock_ex(nip);
	hammer2_trans_done(&trans);

	/*
	 * Finalize namecache
	 */
	if (error == 0) {
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, *ap->a_vpp);
		/* hammer2_knote(ap->a_dvp, NOTE_WRITE); */
	}
	return error;
}
1924
1925 /*
1926  * hammer2_vop_nremove { nch, dvp, cred }
1927  */
1928 static
1929 int
1930 hammer2_vop_nremove(struct vop_nremove_args *ap)
1931 {
1932         hammer2_inode_t *dip;
1933         hammer2_mount_t *hmp;
1934         hammer2_trans_t trans;
1935         struct namecache *ncp;
1936         const uint8_t *name;
1937         size_t name_len;
1938         int error;
1939
1940         dip = VTOI(ap->a_dvp);
1941         hmp = dip->hmp;
1942         if (hmp->ronly)
1943                 return(EROFS);
1944
1945         ncp = ap->a_nch->ncp;
1946         name = ncp->nc_name;
1947         name_len = ncp->nc_nlen;
1948         hammer2_trans_init(&trans, hmp);
1949         error = hammer2_unlink_file(&trans, dip, name, name_len, 0);
1950         hammer2_trans_done(&trans);
1951         if (error == 0) {
1952                 cache_unlink(ap->a_nch);
1953         }
1954         return (error);
1955 }
1956
1957 /*
1958  * hammer2_vop_nrmdir { nch, dvp, cred }
1959  */
1960 static
1961 int
1962 hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
1963 {
1964         hammer2_inode_t *dip;
1965         hammer2_mount_t *hmp;
1966         hammer2_trans_t trans;
1967         struct namecache *ncp;
1968         const uint8_t *name;
1969         size_t name_len;
1970         int error;
1971
1972         dip = VTOI(ap->a_dvp);
1973         hmp = dip->hmp;
1974         if (hmp->ronly)
1975                 return(EROFS);
1976
1977         ncp = ap->a_nch->ncp;
1978         name = ncp->nc_name;
1979         name_len = ncp->nc_nlen;
1980
1981         hammer2_trans_init(&trans, hmp);
1982         error = hammer2_unlink_file(&trans, dip, name, name_len, 1);
1983         hammer2_trans_done(&trans);
1984         if (error == 0) {
1985                 cache_unlink(ap->a_nch);
1986         }
1987         return (error);
1988 }
1989
/*
 * hammer2_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 *
 * Rename {fdvp, fnch} to {tdvp, tnch}: remove any pre-existing target,
 * re-consolidate hardlink targets if necessary, unlink the source
 * entry, and reconnect the inode's chain at the target location.
 */
static
int
hammer2_vop_nrename(struct vop_nrename_args *ap)
{
	struct namecache *fncp;
	struct namecache *tncp;
	hammer2_inode_t *fdip;
	hammer2_inode_t *tdip;
	hammer2_inode_t *ip;
	hammer2_chain_t *chain;
	hammer2_mount_t *hmp;
	hammer2_trans_t trans;
	const uint8_t *fname;
	size_t fname_len;
	const uint8_t *tname;
	size_t tname_len;
	int error;

	/* cross-mount renames are not supported */
	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);	/* source directory */
	tdip = VTOI(ap->a_tdvp);	/* target directory */

	hmp = fdip->hmp;		/* check read-only filesystem */
	if (hmp->ronly)
		return(EROFS);

	fncp = ap->a_fnch->ncp;		/* entry name in source */
	fname = fncp->nc_name;
	fname_len = fncp->nc_nlen;

	tncp = ap->a_tnch->ncp;		/* entry name in target */
	tname = tncp->nc_name;
	tname_len = tncp->nc_nlen;

	hammer2_trans_init(&trans, hmp);

	/*
	 * ip is the inode being removed.  If this is a hardlink then
	 * ip represents the actual file and not the hardlink marker.
	 */
	ip = VTOI(fncp->nc_vp);
	chain = NULL;

	/*
	 * Keep a tight grip on the inode so the temporary unlinking from
	 * the source location prior to linking to the target location
	 * does not cause the chain to be destroyed.
	 *
	 * NOTE: To avoid deadlocks we cannot lock (ip) while we are
	 *       unlinking elements from their directories.  Locking
	 *       the nlinks field does not lock the whole inode.
	 */
	hammer2_inode_ref(ip);

	/*
	 * Remove target if it exists
	 */
	error = hammer2_unlink_file(&trans, tdip, tname, tname_len, -1);
	if (error && error != ENOENT)	/* a missing target is fine */
		goto done;
	cache_setunresolved(ap->a_tnch);

	/*
	 * When renaming a hardlinked file we may have to re-consolidate
	 * the location of the hardlink target.  Since the element is simply
	 * being moved, nlinks is not modified in this case.
	 *
	 * If ip represents a regular file the consolidation code essentially
	 * does nothing other than return the locked chain.
	 *
	 * The returned chain will be locked.
	 */
	error = hammer2_hardlink_consolidate(&trans, ip, &chain, tdip, 0);
	if (error)
		goto done;

	/*
	 * Disconnect (fdip, fname) from the source directory.  This will
	 * disconnect (ip) if it represents a direct file.  If (ip) represents
	 * a hardlink the HARDLINK pointer object will be removed but the
	 * hardlink will stay intact.
	 *
	 * The target chain may be marked DELETED but will not be destroyed
	 * since we retain our hold on ip and chain.
	 */
	error = hammer2_unlink_file(&trans, fdip, fname, fname_len, -1);
	KKASSERT(error != EAGAIN);
	if (error)
		goto done;

	/*
	 * Reconnect ip to target directory using chain.  Chains cannot
	 * actually be moved, so this will duplicate the chain in the new
	 * spot and assign it to the ip, replacing the old chain.
	 *
	 * WARNING: chain locks can lock buffer cache buffers, to avoid
	 *          deadlocks we want to unlock before issuing a cache_*()
	 *          op (that might have to lock a vnode).
	 */
	error = hammer2_inode_connect(&trans, tdip,
				      ip, &chain,
				      tname, tname_len);
	if (error == 0) {
		if (chain) {
			hammer2_chain_unlock(chain);
			chain = NULL;
		}
		cache_rename(ap->a_fnch, ap->a_tnch);
	}
done:
	/* error paths may still hold the consolidated chain locked */
	if (chain)
		hammer2_chain_unlock(chain);
	hammer2_inode_drop(ip);
	hammer2_trans_done(&trans);

	return (error);
}
2114
2115 static int hammer2_strategy_read(struct vop_strategy_args *ap);
2116 static int hammer2_strategy_write(struct vop_strategy_args *ap);
2117
2118 static
2119 int
2120 hammer2_vop_strategy(struct vop_strategy_args *ap)
2121 {
2122         struct bio *biop;
2123         struct buf *bp;
2124         int error;
2125
2126         biop = ap->a_bio;
2127         bp = biop->bio_buf;
2128
2129         switch(bp->b_cmd) {
2130         case BUF_CMD_READ:
2131                 error = hammer2_strategy_read(ap);
2132                 ++hammer2_iod_file_read;
2133                 break;
2134         case BUF_CMD_WRITE:
2135                 error = hammer2_strategy_write(ap);
2136                 ++hammer2_iod_file_write;
2137                 break;
2138         default:
2139                 bp->b_error = error = EINVAL;
2140                 bp->b_flags |= B_ERROR;
2141                 biodone(biop);
2142                 break;
2143         }
2144
2145         return (error);
2146 }
2147
/*
 * Read strategy: resolve the logical buffer offset to a device offset
 * (cached in the pushed bio) and then either zero-fill, forward the I/O
 * to the device, or copy out inode-embedded data.
 */
static
int
hammer2_strategy_read(struct vop_strategy_args *ap)
{
	struct buf *bp;
	struct bio *bio;
	struct bio *nbio;		/* translated (device-level) bio */
	hammer2_mount_t *hmp;
	hammer2_inode_t *ip;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t lbase;		/* logical offset of the buffer */

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	nbio = push_bio(bio);

	lbase = bio->bio_offset;
	chain = NULL;
	KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);

	/*
	 * We must characterize the logical->physical translation if it
	 * has not already been cached.
	 *
	 * Physical data references < LBUFSIZE are never cached.  This
	 * includes both small-block allocations and inode-embedded data.
	 */
	if (nbio->bio_offset == NOOFFSET) {
		hammer2_inode_lock_sh(ip);

		parent = hammer2_chain_lookup_init(ip->chain,
						   HAMMER2_LOOKUP_SHARED);

		chain = hammer2_chain_lookup(&parent, lbase, lbase,
					     HAMMER2_LOOKUP_NODATA |
					     HAMMER2_LOOKUP_SHARED);
		if (chain == NULL) {
			/*
			 * Data is zero-fill
			 */
			nbio->bio_offset = ZFOFFSET;
		} else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			/*
			 * Data is embedded in the inode (do nothing)
			 */
			KKASSERT(chain == parent);
			hammer2_chain_unlock(chain);
		} else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			/*
			 * Data is on-media
			 */
			KKASSERT(bp->b_bcount == chain->bytes);
			nbio->bio_offset = chain->bref.data_off &
					   HAMMER2_OFF_MASK;
			hammer2_chain_unlock(chain);
			KKASSERT(nbio->bio_offset != 0);
		} else {
			panic("hammer2_strategy_read: unknown bref type");
		}
		hammer2_chain_lookup_done(parent);
		hammer2_inode_unlock_sh(ip);
	}

	/* optional debug trace of the logical->device translation */
	if (hammer2_debug & 0x0020) {
		kprintf("read %016jx %016jx\n",
			bio->bio_offset, nbio->bio_offset);
	}

	if (nbio->bio_offset == ZFOFFSET) {
		/*
		 * Data is zero-fill
		 */
		bp->b_resid = 0;
		bp->b_error = 0;
		bzero(bp->b_data, bp->b_bcount);
		biodone(nbio);
	} else if (nbio->bio_offset != NOOFFSET) {
		/*
		 * Forward direct IO to the device
		 */
		vn_strategy(hmp->devvp, nbio);
	} else {
		/*
		 * Data is embedded in inode.
		 *
		 * NOTE(review): chain->data is dereferenced here after the
		 * chain/parent/inode locks were released above — presumably
		 * the inode's reference keeps the embedded data stable;
		 * verify against the chain locking rules.
		 */
		bcopy(chain->data->ipdata.u.data, bp->b_data,
		      HAMMER2_EMBEDDED_BYTES);
		bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
		      bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
		bp->b_resid = 0;
		bp->b_error = 0;
		biodone(nbio);
	}
	return (0);
}
2246
/*
 * Write strategy: inode-embedded data is copied directly into the
 * inode's chain; all other writes are forwarded to the device using the
 * device offset already cached in the pushed bio.
 */
static
int
hammer2_strategy_write(struct vop_strategy_args *ap)
{
	struct buf *bp;
	struct bio *bio;
	struct bio *nbio;		/* translated (device-level) bio */
	hammer2_mount_t *hmp;
	hammer2_inode_t *ip;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	nbio = push_bio(bio);

	KKASSERT((bio->bio_offset & HAMMER2_PBUFMASK64) == 0);
	KKASSERT(nbio->bio_offset != 0 && nbio->bio_offset != ZFOFFSET);

	if (nbio->bio_offset == NOOFFSET) {
		/*
		 * Must be embedded in the inode.
		 *
		 * Because the inode is dirty, the chain must exist whether
		 * the inode is locked or not. XXX
		 */
		KKASSERT(bio->bio_offset == 0);
		KKASSERT(ip->chain && ip->chain->data);
		bcopy(bp->b_data, ip->chain->data->ipdata.u.data,
		      HAMMER2_EMBEDDED_BYTES);
		bp->b_resid = 0;
		bp->b_error = 0;
		biodone(nbio);

		/*
		 * This special flag does not follow the normal MODIFY rules
		 * because we might deadlock on ip.  Instead we depend on
		 * VOP_FSYNC() to detect the case.
		 */
		atomic_set_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
	} else {
		/*
		 * Forward direct IO to the device
		 */
		vn_strategy(hmp->devvp, nbio);
	}
	return (0);
}
2295
2296 /*
2297  * hammer2_vop_ioctl { vp, command, data, fflag, cred }
2298  */
2299 static
2300 int
2301 hammer2_vop_ioctl(struct vop_ioctl_args *ap)
2302 {
2303         hammer2_mount_t *hmp;
2304         hammer2_inode_t *ip;
2305         int error;
2306
2307         ip = VTOI(ap->a_vp);
2308         hmp = ip->hmp;
2309
2310         error = hammer2_ioctl(ip, ap->a_command, (void *)ap->a_data,
2311                               ap->a_fflag, ap->a_cred);
2312         return (error);
2313 }
2314
2315 static
2316 int 
2317 hammer2_vop_mountctl(struct vop_mountctl_args *ap)
2318 {
2319         struct mount *mp;
2320         hammer2_pfsmount_t *pmp;
2321         int rc;
2322
2323         switch (ap->a_op) {
2324         case (MOUNTCTL_SET_EXPORT):
2325                 mp = ap->a_head.a_ops->head.vv_mount;
2326                 pmp = MPTOPMP(mp);
2327
2328                 if (ap->a_ctllen != sizeof(struct export_args))
2329                         rc = (EINVAL);
2330                 else
2331                         rc = vfs_export(mp, &pmp->export,
2332                                         (const struct export_args *)ap->a_ctl);
2333                 break;
2334         default:
2335                 rc = vop_stdmountctl(ap);
2336                 break;
2337         }
2338         return (rc);
2339 }
2340
2341 struct vop_ops hammer2_vnode_vops = {
2342         .vop_default    = vop_defaultop,
2343         .vop_fsync      = hammer2_vop_fsync,
2344         .vop_getpages   = vop_stdgetpages,
2345         .vop_putpages   = vop_stdputpages,
2346         .vop_access     = hammer2_vop_access,
2347         .vop_advlock    = hammer2_vop_advlock,
2348         .vop_close      = hammer2_vop_close,
2349         .vop_nlink      = hammer2_vop_nlink,
2350         .vop_ncreate    = hammer2_vop_ncreate,
2351         .vop_nsymlink   = hammer2_vop_nsymlink,
2352         .vop_nremove    = hammer2_vop_nremove,
2353         .vop_nrmdir     = hammer2_vop_nrmdir,
2354         .vop_nrename    = hammer2_vop_nrename,
2355         .vop_getattr    = hammer2_vop_getattr,
2356         .vop_setattr    = hammer2_vop_setattr,
2357         .vop_readdir    = hammer2_vop_readdir,
2358         .vop_readlink   = hammer2_vop_readlink,
2359         .vop_getpages   = vop_stdgetpages,
2360         .vop_putpages   = vop_stdputpages,
2361         .vop_read       = hammer2_vop_read,
2362         .vop_write      = hammer2_vop_write,
2363         .vop_open       = hammer2_vop_open,
2364         .vop_inactive   = hammer2_vop_inactive,
2365         .vop_reclaim    = hammer2_vop_reclaim,
2366         .vop_nresolve   = hammer2_vop_nresolve,
2367         .vop_nlookupdotdot = hammer2_vop_nlookupdotdot,
2368         .vop_nmkdir     = hammer2_vop_nmkdir,
2369         .vop_ioctl      = hammer2_vop_ioctl,
2370         .vop_mountctl   = hammer2_vop_mountctl,
2371         .vop_bmap       = hammer2_vop_bmap,
2372         .vop_strategy   = hammer2_vop_strategy,
2373 };
2374
/*
 * Vnode operations vector for special-file (device) vnodes.
 * Intentionally empty for now — presumably unfilled entries fall back
 * to system defaults; TODO confirm against the VFS dispatch rules.
 */
struct vop_ops hammer2_spec_vops = {

};
2378
/*
 * Vnode operations vector for fifo vnodes.
 * Intentionally empty for now — presumably unfilled entries fall back
 * to system defaults; TODO confirm against the VFS dispatch rules.
 */
struct vop_ops hammer2_fifo_vops = {

};