Merge branch 'vendor/LESS'
[dragonfly.git] / sys / vfs / hammer2 / hammer2_vnops.c
1 /*
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * Kernel Filesystem interface
37  *
38  * NOTE! local ipdata pointers must be reloaded on any modifying operation
39  *       to the inode as its underlying chain may have changed.
40  */
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/fcntl.h>
46 #include <sys/buf.h>
47 #include <sys/proc.h>
48 #include <sys/namei.h>
49 #include <sys/mount.h>
50 #include <sys/vnode.h>
51 #include <sys/mountctl.h>
52 #include <sys/dirent.h>
53 #include <sys/uio.h>
54
55 #include "hammer2.h"
56
57 #define ZFOFFSET        (-2LL)
58
59 static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
60                                 int seqcount);
61 static int hammer2_write_file(hammer2_trans_t *trans, hammer2_inode_t *ip,
62                                 hammer2_chain_t **parentp,
63                                 struct uio *uio, int ioflag, int seqcount);
64 static hammer2_off_t hammer2_assign_physical(hammer2_trans_t *trans,
65                                 hammer2_inode_t *ip, hammer2_chain_t **parentp,
66                                 hammer2_key_t lbase, int lblksize,
67                                 int *errorp);
68 static void hammer2_extend_file(hammer2_trans_t *trans, hammer2_inode_t *ip,
69                                 hammer2_chain_t **parentp, hammer2_key_t nsize);
70 static void hammer2_truncate_file(hammer2_trans_t *trans, hammer2_inode_t *ip,
71                                 hammer2_chain_t **parentp, hammer2_key_t nsize);
72
73 static __inline
74 void
75 hammer2_knote(struct vnode *vp, int flags)
76 {
77         if (flags)
78                 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
79 }
80
81 /*
82  * Last reference to a vnode is going away but it is still cached.
83  */
84 static
85 int
86 hammer2_vop_inactive(struct vop_inactive_args *ap)
87 {
88         hammer2_inode_t *ip;
89         hammer2_chain_t *parent;
90         struct vnode *vp;
91
92         vp = ap->a_vp;
93         ip = VTOI(vp);
94
95         /*
96          * Degenerate case
97          */
98         if (ip == NULL) {
99                 vrecycle(vp);
100                 return (0);
101         }
102
103         /*
104          * Detect updates to the embedded data which may be synchronized by
105          * the strategy code.  Simply mark the inode modified so it gets
106          * picked up by our normal flush.
107          */
108         parent = hammer2_inode_lock_ex(ip);
109         KKASSERT(parent);
110
111         /*
112          * Check for deleted inodes and recycle immediately.
113          */
114         if (parent->flags & HAMMER2_CHAIN_DELETED) {
115                 hammer2_inode_unlock_ex(ip, parent);
116                 vrecycle(vp);
117         } else {
118                 hammer2_inode_unlock_ex(ip, parent);
119         }
120         return (0);
121 }
122
/*
 * Reclaim a vnode so that it can be reused; after the inode is
 * disassociated, the filesystem must manage it alone.
 *
 * Severs the vp<->ip association, propagates DESTROYED/SUBMODIFIED for
 * deleted inodes so the flusher can skip their dirty blocks, and drops
 * the vnode's reference on the inode.  Always returns 0.
 */
static
int
hammer2_vop_reclaim(struct vop_reclaim_args *ap)
{
        hammer2_chain_t *chain;
        hammer2_inode_t *ip;
        hammer2_mount_t *hmp;
#if 0
        hammer2_trans_t trans;
#endif
        struct vnode *vp;

        vp = ap->a_vp;
        ip = VTOI(vp);
        /* Vnode never had an inode attached; nothing to tear down. */
        if (ip == NULL)
                return(0);
        hmp = ip->hmp;

        /*
         * Set SUBMODIFIED so we can detect and propagate the DESTROYED
         * bit in the flush code.
         *
         * ip->chain might be stale, correct it before checking as older
         * versions of the chain are likely marked deleted even if the
         * file hasn't been.  XXX ip->chain should never be stale on
         * reclaim.
         */
        chain = hammer2_inode_lock_ex(ip);
        /* NOTE(review): diagnostic only -- a duplinked chain here is
         * unexpected (see XXX above); nothing corrective is done. */
        if (chain->next_parent)
                kprintf("RECLAIM DUPLINKED IP: %p %p\n", ip, ip->chain);

        /*
         * The final close of a deleted file or directory marks it for
         * destruction.  The DESTROYED flag allows the flusher to shortcut
         * any modified blocks still unflushed (that is, just ignore them).
         *
         * HAMMER2 usually does not try to optimize the freemap by returning
         * deleted blocks to it as it does not usually know how many snapshots
         * might be referencing portions of the file/dir.  XXX TODO.
         *
         * XXX TODO - However, any modified file as-of when a snapshot is made
         *            cannot use this optimization as some of the modifications
         *            may wind up being part of the snapshot.
         */
        vp->v_data = NULL;
        ip->vp = NULL;
        if (chain->flags & HAMMER2_CHAIN_DELETED) {
                KKASSERT(chain->flags & HAMMER2_CHAIN_DELETED);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROYED |
                                              HAMMER2_CHAIN_SUBMODIFIED);
        }
#if 0
        /*
         * XXX chains will be flushed on sync, no need to do it here.
         */
        if (chain->flags & (HAMMER2_CHAIN_MODIFIED |
                            HAMMER2_CHAIN_DELETED |
                            HAMMER2_CHAIN_SUBMODIFIED)) {
                hammer2_trans_init(ip->hmp, &trans, HAMMER2_TRANS_ISFLUSH);
                hammer2_chain_flush(&trans, chain);
                hammer2_trans_done(&trans);
        }
#endif
        hammer2_inode_unlock_ex(ip, chain);             /* unlock */
        hammer2_inode_drop(ip);                         /* vp ref */
        /* chain no longer referenced */
        /* chain = NULL; not needed */

        /*
         * XXX handle background sync when ip dirty, kernel will no longer
         * notify us regarding this inode because there is no longer a
         * vnode attached to it.
         */

        return (0);
}
203
204 static
205 int
206 hammer2_vop_fsync(struct vop_fsync_args *ap)
207 {
208         hammer2_mount_t *hmp;
209         hammer2_inode_t *ip;
210         hammer2_trans_t trans;
211         hammer2_chain_t *chain;
212         struct vnode *vp;
213
214         vp = ap->a_vp;
215         ip = VTOI(vp);
216         hmp = ip->hmp;
217
218         hammer2_trans_init(hmp, &trans, HAMMER2_TRANS_ISFLUSH);
219         chain = hammer2_inode_lock_ex(ip);
220
221         vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
222
223         /*
224          * Calling chain_flush here creates a lot of duplicative
225          * COW operations due to non-optimal vnode ordering.
226          *
227          * Only do it for an actual fsync() syscall.  The other forms
228          * which call this function will eventually call chain_flush
229          * on the volume root as a catch-all, which is far more optimal.
230          */
231         atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
232         if (ap->a_flags & VOP_FSYNC_SYSCALL) {
233                 hammer2_chain_flush(&trans, chain);
234         }
235         hammer2_inode_unlock_ex(ip, chain);
236         hammer2_trans_done(&trans);
237
238         return (0);
239 }
240
241 static
242 int
243 hammer2_vop_access(struct vop_access_args *ap)
244 {
245         hammer2_inode_t *ip = VTOI(ap->a_vp);
246         hammer2_inode_data_t *ipdata;
247         hammer2_chain_t *chain;
248         uid_t uid;
249         gid_t gid;
250         int error;
251
252         chain = hammer2_inode_lock_sh(ip);
253         ipdata = &chain->data->ipdata;
254         uid = hammer2_to_unix_xid(&ipdata->uid);
255         gid = hammer2_to_unix_xid(&ipdata->gid);
256         error = vop_helper_access(ap, uid, gid, ipdata->mode, ipdata->uflags);
257         hammer2_inode_unlock_sh(ip, chain);
258
259         return (error);
260 }
261
/*
 * hammer2_vop_getattr { vp, vap, cred }
 *
 * Copy the inode's media attributes into *vap under a shared inode lock.
 * Always returns 0.
 */
static
int
hammer2_vop_getattr(struct vop_getattr_args *ap)
{
        hammer2_inode_data_t *ipdata;
        hammer2_chain_t *chain;
        hammer2_pfsmount_t *pmp;
        hammer2_inode_t *ip;
        struct vnode *vp;
        struct vattr *vap;

        vp = ap->a_vp;
        vap = ap->a_vap;

        ip = VTOI(vp);
        pmp = ip->pmp;

        chain = hammer2_inode_lock_sh(ip);
        ipdata = &chain->data->ipdata;

        vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
        vap->va_fileid = ipdata->inum;
        vap->va_mode = ipdata->mode;
        vap->va_nlink = ipdata->nlinks;
        vap->va_uid = hammer2_to_unix_xid(&ipdata->uid);
        vap->va_gid = hammer2_to_unix_xid(&ipdata->gid);
        /* no device special files represented here */
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ipdata->size;
        vap->va_blocksize = HAMMER2_PBUFSIZE;
        vap->va_flags = ipdata->uflags;
        hammer2_time_to_timespec(ipdata->ctime, &vap->va_ctime);
        hammer2_time_to_timespec(ipdata->mtime, &vap->va_mtime);
        /* atime is not maintained on media; report mtime in its place
         * (see the disabled atime handling in hammer2_vop_setattr) */
        hammer2_time_to_timespec(ipdata->mtime, &vap->va_atime);
        vap->va_gen = 1;
        vap->va_bytes = vap->va_size;   /* XXX */
        vap->va_type = hammer2_get_vtype(chain);
        vap->va_filerev = 0;
        vap->va_uid_uuid = ipdata->uid;
        vap->va_gid_uuid = ipdata->gid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        hammer2_inode_unlock_sh(ip, chain);

        return (0);
}
309
/*
 * hammer2_vop_setattr { vp, vap, cred }
 *
 * Apply the attributes in *vap to the inode within a transaction.
 * NOTE! Every modifying operation goes through hammer2_chain_modify_ip()
 *       and must reload the local ipdata pointer afterwards, since the
 *       underlying chain may have been replaced (COW).
 */
static
int
hammer2_vop_setattr(struct vop_setattr_args *ap)
{
        hammer2_inode_data_t *ipdata;
        hammer2_inode_t *ip;
        hammer2_mount_t *hmp;
        hammer2_chain_t *chain;
        hammer2_trans_t trans;
        struct vnode *vp;
        struct vattr *vap;
        int error;
        int kflags = 0;         /* NOTE(review): accumulated but never passed
                                 * to hammer2_knote() here -- verify intent */
        int domtime = 0;        /* NOTE(review): set on resize but never
                                 * consumed in this function */
        uint64_t ctime;

        vp = ap->a_vp;
        vap = ap->a_vap;
        hammer2_update_time(&ctime);

        ip = VTOI(vp);
        hmp = ip->hmp;

        if (hmp->ronly)
                return(EROFS);

        hammer2_trans_init(hmp, &trans, 0);
        chain = hammer2_inode_lock_ex(ip);
        ipdata = &chain->data->ipdata;
        error = 0;

        /*
         * chflags-style call: adjust the flags and return; no other
         * attributes are processed on this path.
         */
        if (vap->va_flags != VNOVAL) {
                u_int32_t flags;

                flags = ipdata->uflags;
                error = vop_helper_setattr_flags(&flags, vap->va_flags,
                                         hammer2_to_unix_xid(&ipdata->uid),
                                         ap->a_cred);
                if (error == 0) {
                        if (ipdata->uflags != flags) {
                                /* chain may be replaced, ipdata reloaded */
                                ipdata = hammer2_chain_modify_ip(&trans, ip,
                                                                 &chain, 0);
                                ipdata->uflags = flags;
                                ipdata->ctime = ctime;
                                kflags |= NOTE_ATTRIB;
                        }
                        if (ipdata->uflags & (IMMUTABLE | APPEND)) {
                                error = 0;
                                goto done;
                        }
                }
                goto done;
        }
        /*
         * All remaining attribute changes are refused on immutable or
         * append-only files.
         */
        if (ipdata->uflags & (IMMUTABLE | APPEND)) {
                error = EPERM;
                goto done;
        }
        /*
         * chown/chgrp.  Owner/group are stored as uuids on media;
         * convert, compare, and only dirty the chain on a real change.
         */
        if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
                mode_t cur_mode = ipdata->mode;
                uid_t cur_uid = hammer2_to_unix_xid(&ipdata->uid);
                gid_t cur_gid = hammer2_to_unix_xid(&ipdata->gid);
                uuid_t uuid_uid;
                uuid_t uuid_gid;

                error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
                                         ap->a_cred,
                                         &cur_uid, &cur_gid, &cur_mode);
                if (error == 0) {
                        hammer2_guid_to_uuid(&uuid_uid, cur_uid);
                        hammer2_guid_to_uuid(&uuid_gid, cur_gid);
                        if (bcmp(&uuid_uid, &ipdata->uid, sizeof(uuid_uid)) ||
                            bcmp(&uuid_gid, &ipdata->gid, sizeof(uuid_gid)) ||
                            ipdata->mode != cur_mode
                        ) {
                                ipdata = hammer2_chain_modify_ip(&trans, ip,
                                                                 &chain, 0);
                                ipdata->uid = uuid_uid;
                                ipdata->gid = uuid_gid;
                                ipdata->mode = cur_mode;
                                ipdata->ctime = ctime;
                        }
                        kflags |= NOTE_ATTRIB;
                }
        }

        /*
         * Resize the file
         */
        if (vap->va_size != VNOVAL && ipdata->size != vap->va_size) {
                switch(vp->v_type) {
                case VREG:
                        if (vap->va_size == ipdata->size)
                                break;
                        if (vap->va_size < ipdata->size) {
                                hammer2_truncate_file(&trans, ip,
                                                      &chain, vap->va_size);
                        } else {
                                hammer2_extend_file(&trans, ip,
                                                    &chain, vap->va_size);
                        }
                        ipdata = &chain->data->ipdata; /* RELOAD */
                        domtime = 1;
                        break;
                default:
                        /* only regular files may be resized */
                        error = EINVAL;
                        goto done;
                }
        }
#if 0
        /* atime not supported */
        if (vap->va_atime.tv_sec != VNOVAL) {
                ipdata = hammer2_chain_modify_ip(&trans, ip, &chain, 0);
                ipdata->atime = hammer2_timespec_to_time(&vap->va_atime);
                kflags |= NOTE_ATTRIB;
        }
#endif
        if (vap->va_mtime.tv_sec != VNOVAL) {
                ipdata = hammer2_chain_modify_ip(&trans, ip, &chain, 0);
                ipdata->mtime = hammer2_timespec_to_time(&vap->va_mtime);
                kflags |= NOTE_ATTRIB;
        }
        /*
         * chmod.  Helper validates against current ownership; only dirty
         * the chain when the mode actually changes.
         */
        if (vap->va_mode != (mode_t)VNOVAL) {
                mode_t cur_mode = ipdata->mode;
                uid_t cur_uid = hammer2_to_unix_xid(&ipdata->uid);
                gid_t cur_gid = hammer2_to_unix_xid(&ipdata->gid);

                error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
                                         cur_uid, cur_gid, &cur_mode);
                if (error == 0 && ipdata->mode != cur_mode) {
                        ipdata = hammer2_chain_modify_ip(&trans, ip, &chain, 0);
                        ipdata->mode = cur_mode;
                        ipdata->ctime = ctime;
                        kflags |= NOTE_ATTRIB;
                }
        }
done:
        hammer2_inode_unlock_ex(ip, chain);
        hammer2_trans_done(&trans);
        return (error);
}
450
/*
 * hammer2_vop_readdir { vp, uio, cred, eofflag, ncookies, cookies }
 *
 * Emit directory entries into the uio.  Artificial entries "." and ".."
 * occupy offsets 0 and 1; real entries are scanned from the directory
 * inode's chain starting at the saved directory-hash offset.
 */
static
int
hammer2_vop_readdir(struct vop_readdir_args *ap)
{
        hammer2_inode_data_t *ipdata;
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;
        hammer2_inode_t *xip;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        hammer2_chain_t *xchain;
        hammer2_tid_t inum;
        hammer2_key_t lkey;
        struct uio *uio;
        off_t *cookies;
        off_t saveoff;
        int cookie_index;
        int ncookies;
        int error;
        int dtype;
        int r;

        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        uio = ap->a_uio;
        saveoff = uio->uio_offset;

        /*
         * Setup cookies directory entry cookies if requested
         */
        if (ap->a_ncookies) {
                /* rough upper bound on entries that can fit in the uio */
                ncookies = uio->uio_resid / 16 + 1;
                if (ncookies > 1024)
                        ncookies = 1024;
                cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
        } else {
                ncookies = -1;
                cookies = NULL;
        }
        cookie_index = 0;

        parent = hammer2_inode_lock_sh(ip);
        ipdata = &parent->data->ipdata;

        /*
         * Handle artificial entries.  To ensure that only positive 64 bit
         * quantities are returned to userland we always strip off bit 63.
         * The hash code is designed such that codes 0x0000-0x7FFF are not
         * used, allowing us to use these codes for articial entries.
         *
         * Entry 0 is used for '.' and entry 1 is used for '..'.  Do not
         * allow '..' to cross the mount point into (e.g.) the super-root.
         */
        error = 0;
        chain = (void *)(intptr_t)-1;   /* non-NULL for early goto done case */

        if (saveoff == 0) {
                inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 1, ".");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        if (saveoff == 1) {
                /*
                 * Be careful with lockorder when accessing ".."
                 *
                 * (ip is the current dir. xip is the parent dir).
                 */
                inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
                while (ip->pip != NULL && ip != ip->pmp->iroot) {
                        xip = ip->pip;
                        hammer2_inode_ref(xip);
                        /* drop our lock before locking the parent dir to
                         * avoid a child->parent lock-order reversal, then
                         * re-lock and verify ip->pip did not change */
                        hammer2_inode_unlock_sh(ip, parent);
                        xchain = hammer2_inode_lock_sh(xip);
                        parent = hammer2_inode_lock_sh(ip);
                        hammer2_inode_drop(xip);
                        if (xip == ip->pip) {
                                inum = xip->chain->data->ipdata.inum &
                                       HAMMER2_DIRHASH_USERMSK;
                                hammer2_inode_unlock_sh(xip, xchain);
                                break;
                        }
                        hammer2_inode_unlock_sh(xip, xchain);
                }
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 2, "..");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        lkey = saveoff | HAMMER2_DIRHASH_VISIBLE;

        /*
         * parent is the inode chain, already locked for us.  Don't
         * double lock shared locks as this will screw up upgrades.
         */
        if (error) {
                goto done;
        }
        /* try an exact-key lookup first, then fall back to a ranged scan */
        chain = hammer2_chain_lookup(&parent, lkey, lkey,
                                     HAMMER2_LOOKUP_SHARED);
        if (chain == NULL) {
                chain = hammer2_chain_lookup(&parent,
                                             lkey, (hammer2_key_t)-1,
                                             HAMMER2_LOOKUP_SHARED);
        }
        while (chain) {
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        dtype = hammer2_get_dtype(chain);
                        saveoff = chain->bref.key & HAMMER2_DIRHASH_USERMSK;
                        r = vop_write_dirent(&error, uio,
                                             chain->data->ipdata.inum &
                                              HAMMER2_DIRHASH_USERMSK,
                                             dtype,
                                             chain->data->ipdata.name_len,
                                             chain->data->ipdata.filename);
                        if (r)
                                break;
                        if (cookies)
                                cookies[cookie_index] = saveoff;
                        ++cookie_index;
                } else {
                        /* XXX chain error */
                        kprintf("bad chain type readdir %d\n",
                                chain->bref.type);
                }

                /*
                 * Keys may not be returned in order so once we have a
                 * placemarker (chain) the scan must allow the full range
                 * or some entries will be missed.
                 */
                chain = hammer2_chain_next(&parent, chain,
                                           HAMMER2_DIRHASH_VISIBLE,
                                           (hammer2_key_t)-1,
                                           HAMMER2_LOOKUP_SHARED);
                if (chain) {
                        saveoff = (chain->bref.key &
                                   HAMMER2_DIRHASH_USERMSK) + 1;
                } else {
                        saveoff = (hammer2_key_t)-1;
                }
                if (cookie_index == ncookies)
                        break;
        }
        if (chain)
                hammer2_chain_unlock(chain);
done:
        hammer2_inode_unlock_sh(ip, parent);
        /* chain == NULL only when the scan ran off the end: report EOF */
        if (ap->a_eofflag)
                *ap->a_eofflag = (chain == NULL);
        uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
        if (error && cookie_index == 0) {
                /* nothing was emitted; hand back no cookies on error */
                if (cookies) {
                        kfree(cookies, M_TEMP);
                        *ap->a_ncookies = 0;
                        *ap->a_cookies = NULL;
                }
        } else {
                if (cookies) {
                        *ap->a_ncookies = cookie_index;
                        *ap->a_cookies = cookies;
                }
        }
        return (error);
}
629
630 /*
631  * hammer2_vop_readlink { vp, uio, cred }
632  */
633 static
634 int
635 hammer2_vop_readlink(struct vop_readlink_args *ap)
636 {
637         struct vnode *vp;
638         hammer2_mount_t *hmp;
639         hammer2_inode_t *ip;
640         int error;
641
642         vp = ap->a_vp;
643         if (vp->v_type != VLNK)
644                 return (EINVAL);
645         ip = VTOI(vp);
646         hmp = ip->hmp;
647
648         error = hammer2_read_file(ip, ap->a_uio, 0);
649         return (error);
650 }
651
652 static
653 int
654 hammer2_vop_read(struct vop_read_args *ap)
655 {
656         struct vnode *vp;
657         hammer2_mount_t *hmp;
658         hammer2_inode_t *ip;
659         struct uio *uio;
660         int error;
661         int seqcount;
662         int bigread;
663
664         /*
665          * Read operations supported on this vnode?
666          */
667         vp = ap->a_vp;
668         if (vp->v_type != VREG)
669                 return (EINVAL);
670
671         /*
672          * Misc
673          */
674         ip = VTOI(vp);
675         hmp = ip->hmp;
676         uio = ap->a_uio;
677         error = 0;
678
679         seqcount = ap->a_ioflag >> 16;
680         bigread = (uio->uio_resid > 100 * 1024 * 1024);
681
682         error = hammer2_read_file(ip, uio, seqcount);
683         return (error);
684 }
685
686 static
687 int
688 hammer2_vop_write(struct vop_write_args *ap)
689 {
690         hammer2_mount_t *hmp;
691         hammer2_inode_t *ip;
692         hammer2_trans_t trans;
693         hammer2_chain_t *parent;
694         thread_t td;
695         struct vnode *vp;
696         struct uio *uio;
697         int error;
698         int seqcount;
699         int bigwrite;
700
701         /*
702          * Read operations supported on this vnode?
703          */
704         vp = ap->a_vp;
705         if (vp->v_type != VREG)
706                 return (EINVAL);
707
708         /*
709          * Misc
710          */
711         ip = VTOI(vp);
712         hmp = ip->hmp;
713         uio = ap->a_uio;
714         error = 0;
715         if (hmp->ronly)
716                 return (EROFS);
717
718         seqcount = ap->a_ioflag >> 16;
719         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
720
721         /*
722          * Check resource limit
723          */
724         if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
725             uio->uio_offset + uio->uio_resid >
726              td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
727                 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
728                 return (EFBIG);
729         }
730
731         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
732
733         /*
734          * ip must be locked if extending the file.
735          * ip must be locked to avoid racing a truncation.
736          *
737          * ip must be marked modified, particularly because the write
738          * might wind up being copied into the embedded data area.
739          */
740         hammer2_trans_init(ip->hmp, &trans, 0);
741         parent = hammer2_inode_lock_ex(ip);
742         error = hammer2_write_file(&trans, ip, &parent,
743                                    uio, ap->a_ioflag, seqcount);
744         hammer2_inode_unlock_ex(ip, parent);
745         hammer2_trans_done(&trans);
746
747         return (error);
748 }
749
750 /*
751  * Perform read operations on a file or symlink given an UNLOCKED
752  * inode and uio.
753  *
754  * The passed ip is not locked.
755  */
756 static
757 int
758 hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
759 {
760         hammer2_off_t size;
761         hammer2_chain_t *parent;
762         struct buf *bp;
763         int error;
764
765         error = 0;
766
767         /*
768          * UIO read loop.
769          */
770         parent = hammer2_inode_lock_sh(ip);
771         size = ip->chain->data->ipdata.size;
772
773         while (uio->uio_resid > 0 && uio->uio_offset < size) {
774                 hammer2_key_t lbase;
775                 hammer2_key_t leof;
776                 int lblksize;
777                 int loff;
778                 int n;
779
780                 lblksize = hammer2_calc_logical(ip, uio->uio_offset,
781                                                 &lbase, &leof);
782
783                 error = cluster_read(ip->vp, leof, lbase, lblksize,
784                                      uio->uio_resid, seqcount * BKVASIZE,
785                                      &bp);
786
787                 if (error)
788                         break;
789                 loff = (int)(uio->uio_offset - lbase);
790                 n = lblksize - loff;
791                 if (n > uio->uio_resid)
792                         n = uio->uio_resid;
793                 if (n > size - uio->uio_offset)
794                         n = (int)(size - uio->uio_offset);
795                 bp->b_flags |= B_AGE;
796                 uiomove((char *)bp->b_data + loff, n, uio);
797                 bqrelse(bp);
798         }
799         hammer2_inode_unlock_sh(ip, parent);
800         return (error);
801 }
802
/*
 * Called with a locked (ip) to do the underlying write to a file or
 * to build the symlink target.
 *
 * The inode is held exclusively locked via (ip)/(*parentp) on entry and
 * exit, but the lock is cycled (released and reacquired) around blocking
 * operations: bwillwrite() throttling and the uiomove() copyin.  After
 * every reacquire, and after any operation that may replace ip->chain
 * (extend/truncate/assign_physical), the local ipdata pointer must be
 * reloaded -- ip->chain->data can move when the chain is modified.
 *
 * Returns 0 on success or a uiomove()/bread() errno.  On error after the
 * file was extended, the extension is rolled back via
 * hammer2_truncate_file() so a failed write never leaves a grown EOF.
 */
static
int
hammer2_write_file(hammer2_trans_t *trans, hammer2_inode_t *ip,
		   hammer2_chain_t **parentp,
		   struct uio *uio, int ioflag, int seqcount)
{
	hammer2_inode_data_t *ipdata;
	hammer2_key_t old_eof;		/* pre-write EOF, for error rollback */
	struct buf *bp;
	int kflags;			/* accumulated kqueue NOTE_* flags */
	int error;
	int modified = 0;		/* any data/EOF change -> update mtime */

	/*
	 * Setup if append
	 */
	ipdata = &ip->chain->data->ipdata;
	if (ioflag & IO_APPEND)
		uio->uio_offset = ipdata->size;
	kflags = 0;
	error = 0;

	/*
	 * vfs_sync visibility.  Interlocked by the inode ex lock so we
	 * shouldn't have to reassert it multiple times if the ip->chain
	 * is modified/flushed multiple times during the write, except
	 * when we release/reacquire the inode ex lock.
	 */
	atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);

	/*
	 * Extend the file if necessary.  If the write fails at some point
	 * we will truncate it back down to cover as much as we were able
	 * to write.
	 *
	 * Doing this now makes it easier to calculate buffer sizes in
	 * the loop.
	 */
	KKASSERT(ipdata->type != HAMMER2_OBJTYPE_HARDLINK);
	old_eof = ipdata->size;
	if (uio->uio_offset + uio->uio_resid > ipdata->size) {
		modified = 1;
		hammer2_extend_file(trans, ip, parentp,
				    uio->uio_offset + uio->uio_resid);
		ipdata = &ip->chain->data->ipdata;	/* RELOAD */
		kflags |= NOTE_EXTEND;
	}
	KKASSERT(ipdata->type != HAMMER2_OBJTYPE_HARDLINK);

	/*
	 * UIO write loop, one logical block per iteration.
	 */
	while (uio->uio_resid > 0) {
		hammer2_key_t lbase;	/* logical base of current block */
		hammer2_key_t leof;	/* logical EOF for cluster_write */
		int trivial;		/* 1 if copy fully covers buffer */
		int lblksize;		/* logical block size (may straddle EOF) */
		int loff;		/* byte offset within the block */
		int n;			/* bytes to copy this iteration */

		/*
		 * Don't allow the buffer build to blow out the buffer
		 * cache.
		 */
		if ((ioflag & IO_RECURSE) == 0) {
			/*
			 * XXX should try to leave this unlocked through
			 *	the whole loop
			 */
			hammer2_inode_unlock_ex(ip, *parentp);
			bwillwrite(HAMMER2_PBUFSIZE);
			*parentp = hammer2_inode_lock_ex(ip);
			/* lock was cycled: reassert flag, reload ipdata */
			atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
			ipdata = &ip->chain->data->ipdata;	/* reload */
		}

		/* XXX bigwrite & signal check test */

		/*
		 * This nominally tells us how much we can cluster and
		 * what the logical buffer size needs to be.  Currently
		 * we don't try to cluster the write and just handle one
		 * block at a time.
		 */
		lblksize = hammer2_calc_logical(ip, uio->uio_offset,
						&lbase, &leof);
		loff = (int)(uio->uio_offset - lbase);

		/*
		 * Calculate bytes to copy this transfer and whether the
		 * copy completely covers the buffer or not.
		 *
		 * NOTE(review): the 'loff == lbase' test below looks like
		 * it was meant to be 'loff == 0' (write starts at the
		 * block base and runs exactly to EOF).  As written it only
		 * matches when lbase == 0; other to-EOF block writes take
		 * the non-trivial read-before-write path.  Safe (extra
		 * bread) but likely a performance bug -- TODO confirm.
		 */
		trivial = 0;
		n = lblksize - loff;
		if (n > uio->uio_resid) {
			n = uio->uio_resid;
			if (loff == lbase &&
			    uio->uio_offset + n == ipdata->size)
				trivial = 1;
		} else if (loff == 0) {
			trivial = 1;
		}

		/*
		 * Get the buffer
		 */
		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ip->vp, lbase, lblksize, &bp);
			}
		} else if (trivial) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 *
			 * (The strategy code will detect zero-fill physical
			 * blocks for this case).
			 */
			error = bread(ip->vp, lbase, lblksize, &bp);
			if (error == 0)
				bheavy(bp);
		}

		if (error) {
			brelse(bp);
			break;
		}

		/*
		 * We have to assign physical storage to the buffer we intend
		 * to dirty or write now to avoid deadlocks in the strategy
		 * code later.
		 *
		 * This can return NOOFFSET for inode-embedded data.  The
		 * strategy code will take care of it in that case.
		 */
		bp->b_bio2.bio_offset =
			hammer2_assign_physical(trans, ip, parentp,
						lbase, lblksize, &error);
		ipdata = &ip->chain->data->ipdata;	/* RELOAD */
		if (error) {
			brelse(bp);
			break;
		}

		/*
		 * Ok, copy the data in.  uiomove() can fault and block, so
		 * the inode lock is dropped across it and ipdata reloaded
		 * afterward.
		 */
		hammer2_inode_unlock_ex(ip, *parentp);
		error = uiomove(bp->b_data + loff, n, uio);
		*parentp = hammer2_inode_lock_ex(ip);
		atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
		ipdata = &ip->chain->data->ipdata;	/* reload */
		kflags |= NOTE_WRITE;
		modified = 1;

		if (error) {
			brelse(bp);
			break;
		}

		/* XXX update ip_data.mtime */

		/*
		 * Once we dirty a buffer any cached offset becomes invalid.
		 *
		 * NOTE: For cluster_write() always use the trailing block
		 *	 size, which is HAMMER2_PBUFSIZE.  lblksize is the
		 *	 eof-straddling blocksize and is incorrect.
		 */
		bp->b_flags |= B_AGE;
		if (lbase == 0 && (ipdata->op_flags &
				   HAMMER2_OPFLAG_DIRECTDATA)) {
			/*
			 * Writing to the inode's embedded data must be
			 * synchronous because the strategy code isn't
			 * allowed to acquire chain locks.
			 *
			 * Deal with chain interactions here.
			 */
			ipdata = hammer2_chain_modify_ip(trans, ip, parentp, 0);
			bwrite(bp);
		} else if (ioflag & IO_SYNC) {
			/*
			 * Synchronous I/O requested.
			 */
			bwrite(bp);
		} else if ((ioflag & IO_DIRECT) && loff + n == lblksize) {
			/* full trailing-edge block on direct I/O: delay */
			if (bp->b_bcount == HAMMER2_PBUFSIZE)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		} else if (ioflag & IO_ASYNC) {
			bawrite(bp);
		} else if (hammer2_cluster_enable) {
			if (bp->b_bcount == HAMMER2_PBUFSIZE)
				bp->b_flags |= B_CLUSTEROK;
			cluster_write(bp, leof, HAMMER2_PBUFSIZE, seqcount);
		} else {
			if (bp->b_bcount == HAMMER2_PBUFSIZE)
				bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}

	/*
	 * Cleanup.  If we extended the file EOF but failed to write through
	 * the entire write is a failure and we have to back-up.
	 */
	if (error && ipdata->size != old_eof) {
		hammer2_truncate_file(trans, ip, parentp, old_eof);
		ipdata = &ip->chain->data->ipdata;	/* RELOAD */
	} else if (modified) {
		/* successful modification: stamp mtime */
		ipdata = hammer2_chain_modify_ip(trans, ip, parentp, 0);
		hammer2_update_time(&ipdata->mtime);
	}
	hammer2_knote(ip->vp, kflags);

	return error;
}
1044
/*
 * Assign physical storage to a logical block.  This function creates the
 * related meta-data chains representing the data blocks and marks them
 * MODIFIED.  We could mark them MOVED instead but ultimately I need to
 * XXX code the flusher to check that the related logical buffer is
 * flushed.
 *
 * NOOFFSET is returned if the data is inode-embedded.  In this case the
 * strategy code will simply bcopy() the data into the inode.
 *
 * The inode's delta_dcount is adjusted.
 *
 * On success *errorp is 0 and the physical (device) offset for lbase is
 * returned.  On a chain-creation failure *errorp is set (though the
 * current code panics before returning in that path).  *parentp may be
 * replaced with the located chain when both share the same core (the
 * DIRECTDATA/inode case) -- callers must reload any cached ipdata.
 */
static
hammer2_off_t
hammer2_assign_physical(hammer2_trans_t *trans, hammer2_inode_t *ip,
			hammer2_chain_t **parentp,
			hammer2_key_t lbase, int lblksize, int *errorp)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_off_t pbase;	/* result: physical offset or NOOFFSET */

	/*
	 * Locate the chain associated with lbase, return a locked chain.
	 * However, do not instantiate any data reference (which utilizes a
	 * device buffer) because we will be using direct IO via the
	 * logical buffer cache buffer.
	 */
	hmp = ip->hmp;
	*errorp = 0;
retry:
	parent = *parentp;
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); /* extra lock */
	chain = hammer2_chain_lookup(&parent,
				     lbase, lbase,
				     HAMMER2_LOOKUP_NODATA);

	if (chain == NULL) {
		/*
		 * We found a hole, create a new chain entry.
		 *
		 * NOTE: DATA chains are created without device backing
		 *	 store (nor do we want any).
		 */
		*errorp = hammer2_chain_create(trans, &parent, &chain,
					       lbase, HAMMER2_PBUFRADIX,
					       HAMMER2_BREF_TYPE_DATA,
					       lblksize);
		if (chain == NULL) {
			hammer2_chain_lookup_done(parent);
			/*
			 * NOTE(review): the goto after panic() is dead
			 * code; presumably a placeholder for future
			 * EAGAIN-style retry handling -- TODO confirm.
			 */
			panic("hammer2_chain_create: par=%p error=%d\n",
				parent, *errorp);
			goto retry;
		}

		/* strip the radix bits to get the device offset */
		pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
		/*ip->delta_dcount += lblksize;*/
	} else {
		switch (chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * The data is embedded in the inode.  The
			 * caller is responsible for marking the inode
			 * modified and copying the data to the embedded
			 * area.
			 */
			pbase = NOOFFSET;
			break;
		case HAMMER2_BREF_TYPE_DATA:
			if (chain->bytes != lblksize) {
				panic("hammer2_assign_physical: "
				      "size mismatch %d/%d\n",
				      lblksize, chain->bytes);
			}
			/* COW/mark-modified without instantiating data */
			hammer2_chain_modify(trans, &chain,
					     HAMMER2_MODIFY_OPTDATA);
			pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
			break;
		default:
			panic("hammer2_assign_physical: bad type");
			/* NOT REACHED */
			pbase = NOOFFSET;
			break;
		}
	}

	/*
	 * Cleanup.  If chain wound up being the inode (i.e. DIRECTDATA),
	 * we might have to replace *parentp.
	 */
	hammer2_chain_lookup_done(parent);
	if (chain) {
		if (*parentp != chain &&
		    (*parentp)->core == chain->core) {
			/* swap: keep chain locked as the new *parentp */
			parent = *parentp;
			*parentp = chain;
			hammer2_chain_unlock(parent);
		} else {
			hammer2_chain_unlock(chain);
		}
	}

	return (pbase);
}
1150
/*
 * Truncate the size of a file.
 *
 * This routine adjusts ipdata->size smaller, destroying any related
 * data beyond the new EOF and potentially resizing the block straddling
 * the EOF.
 *
 * The inode must be locked.
 *
 * Sequence: (1) kill logical buffers past nsize with nvtruncbuf()
 * (trivial mode, leaving the straddling buffer alone), (2) read and
 * resize/rezero the straddling logical buffer and its chain, (3) run
 * nvtruncbuf() again at the new blocksize to trim fragmentary VM pages,
 * (4) delete all physical data chains at or beyond the new EOF.  The
 * ordering matters: the straddling buffer must be re-assigned storage
 * here because the strategy code cannot do it.
 */
static
void
hammer2_truncate_file(hammer2_trans_t *trans, hammer2_inode_t *ip,
		      hammer2_chain_t **parentp, hammer2_key_t nsize)
{
	hammer2_inode_data_t *ipdata;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t lbase;	/* logical base of EOF-straddling block */
	hammer2_key_t leof;
	struct buf *bp;		/* straddling logical buffer, if any */
	int loff;		/* new EOF offset within its block */
	int error;
	int oblksize;		/* blocksize at old EOF */
	int nblksize;		/* blocksize at new EOF */

	bp = NULL;
	error = 0;
	ipdata = hammer2_chain_modify_ip(trans, ip, parentp, 0);

	/*
	 * Destroy any logical buffer cache buffers beyond the file EOF.
	 *
	 * We call nvtruncbuf() w/ trivial == 1 to prevent it from messing
	 * around with the buffer straddling EOF, because we need to assign
	 * a new physical offset to it.
	 */
	if (ip->vp) {
		nvtruncbuf(ip->vp, nsize,
			   HAMMER2_PBUFSIZE, (int)nsize & HAMMER2_PBUFMASK,
			   1);
	}

	/*
	 * Setup for lookup/search
	 */
	parent = hammer2_chain_lookup_init(ip->chain, 0);

	/*
	 * Handle the case where a chain/logical-buffer straddles the new
	 * EOF.  We told nvtruncbuf() above not to mess with the logical
	 * buffer straddling the EOF because we need to reassign its storage
	 * and can't let the strategy code do it for us.
	 */
	loff = (int)nsize & HAMMER2_PBUFMASK;
	if (loff && ip->vp) {
		/* read at the OLD blocksize before shrinking ipdata->size */
		oblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
		error = bread(ip->vp, lbase, oblksize, &bp);
		KKASSERT(error == 0);
	}
	ipdata->size = nsize;
	/* recompute: blocksize at nsize may shrink once size is updated */
	nblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);

	/*
	 * Fixup the chain element.  If we have a logical buffer in-hand
	 * we don't want to create a conflicting device buffer.
	 */
	if (loff && bp) {
		chain = hammer2_chain_lookup(&parent, lbase, lbase,
					     HAMMER2_LOOKUP_NODATA);
		if (chain) {
			switch(chain->bref.type) {
			case HAMMER2_BREF_TYPE_DATA:
				/* shrink chain, buffer; zero the tail */
				hammer2_chain_resize(trans, ip, bp,
					     parent, &chain,
					     hammer2_allocsize(nblksize),
					     HAMMER2_MODIFY_OPTDATA);
				allocbuf(bp, nblksize);
				bzero(bp->b_data + loff, nblksize - loff);
				bp->b_bio2.bio_offset = chain->bref.data_off &
							HAMMER2_OFF_MASK;
				break;
			case HAMMER2_BREF_TYPE_INODE:
				/* data now fits in the inode-embedded area */
				allocbuf(bp, nblksize);
				bzero(bp->b_data + loff, nblksize - loff);
				bp->b_bio2.bio_offset = NOOFFSET;
				break;
			default:
				panic("hammer2_truncate_file: bad type");
				break;
			}
			hammer2_chain_unlock(chain);
			if (lbase == 0 &&
			    (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA)) {
				/*
				 * Must be synchronous if writing to the
				 * inode's embedded data area.
				 */
				bwrite(bp);
			} else {
				/*
				 * Else a delayed-write is fine.
				 */
				if (bp->b_bcount == HAMMER2_PBUFSIZE)
					bp->b_flags |= B_CLUSTEROK;
				bdwrite(bp);
			}
		} else {
			/*
			 * Destroy clean buffer w/ wrong buffer size.  Retain
			 * backing store.
			 */
			bp->b_flags |= B_RELBUF;
			KKASSERT(bp->b_bio2.bio_offset == NOOFFSET);
			KKASSERT((bp->b_flags & B_DIRTY) == 0);
			bqrelse(bp);
		}
	} else if (loff) {
		/*
		 * WARNING: This utilizes a device buffer for the data.
		 *
		 * This case should not occur because file truncations without
		 * a vnode (and hence no logical buffer cache) should only
		 * always truncate to 0-length.
		 */
		panic("hammer2_truncate_file: non-zero truncation, no-vnode");
#if 0
		chain = hammer2_chain_lookup(&parent, lbase, lbase, 0);
		if (chain) {
			switch(chain->bref.type) {
			case HAMMER2_BREF_TYPE_DATA:
				chain = hammer2_chain_resize(trans, ip, bp,
					     parent, chain,
					     hammer2_allocsize(nblksize),
					     0);
				hammer2_chain_modify(hmp, &chain, 0);
				bzero(chain->data->buf + loff, nblksize - loff);
				break;
			case HAMMER2_BREF_TYPE_INODE:
				if (loff < HAMMER2_EMBEDDED_BYTES) {
					hammer2_chain_modify(hmp, &chain, 0);
					bzero(chain->data->ipdata.u.data + loff,
					      HAMMER2_EMBEDDED_BYTES - loff);
				}
				break;
			}
			hammer2_chain_unlock(chain);
		}
#endif
	}

	/*
	 * Clean up any fragmentory VM pages now that we have properly
	 * resized the straddling buffer.  These pages are no longer
	 * part of the buffer.
	 */
	if (ip->vp) {
		nvtruncbuf(ip->vp, nsize,
			   nblksize, (int)nsize & (nblksize - 1),
			   1);
	}

	/*
	 * Destroy any physical blocks after the new EOF point.
	 */
	lbase = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
	chain = hammer2_chain_lookup(&parent,
				     lbase, (hammer2_key_t)-1,
				     HAMMER2_LOOKUP_NODATA);
	while (chain) {
		/*
		 * Degenerate embedded data case, nothing to loop on.
		 */
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			hammer2_chain_unlock(chain);
			break;
		}

		/*
		 * Delete physical data blocks past the file EOF.
		 */
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			/*ip->delta_dcount -= chain->bytes;*/
			hammer2_chain_delete(trans, chain);
		}
		/* XXX check parent if empty indirect block & delete */
		chain = hammer2_chain_next(&parent, chain,
					   lbase, (hammer2_key_t)-1,
					   HAMMER2_LOOKUP_NODATA);
	}
	hammer2_chain_lookup_done(parent);
}
1342
/*
 * Extend the size of a file.  The inode must be locked.
 *
 * We may have to resize the block straddling the old EOF.
 *
 * Three cases: (1) the file stays within the inode's embedded
 * (DIRECTDATA) area -- only the size and VM/buffer state change;
 * (2) the extension does not move or resize the old-EOF block --
 * nothing further to do; (3) otherwise the block straddling the old
 * EOF must be read, DIRECTDATA mode turned off if active, and the
 * chain at the old EOF created or resized to the new blocksize.
 */
static
void
hammer2_extend_file(hammer2_trans_t *trans, hammer2_inode_t *ip,
		    hammer2_chain_t **parentp, hammer2_key_t nsize)
{
	hammer2_inode_data_t *ipdata;
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	struct buf *bp;
	hammer2_key_t osize;	/* old EOF */
	hammer2_key_t obase;	/* logical base of old-EOF block */
	hammer2_key_t nbase;	/* logical base of that block after resize */
	hammer2_key_t leof;
	int oblksize;		/* blocksize of old-EOF block before resize */
	int nblksize;		/* blocksize of that block after resize */
	int nradix;
	int error;

	KKASSERT(ip->vp);
	hmp = ip->hmp;

	ipdata = hammer2_chain_modify_ip(trans, ip, parentp, 0);

	/*
	 * Nothing to do if the direct-data case is still intact
	 */
	if ((ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
	    nsize <= HAMMER2_EMBEDDED_BYTES) {
		/*
		 * NOTE(review): ipdata->size is assigned before the
		 * nvextendbuf() call, so the "old size" argument below is
		 * already nsize; presumably the pre-assignment size was
		 * intended -- TODO confirm against nvextendbuf(9).
		 */
		ipdata->size = nsize;
		nvextendbuf(ip->vp,
			    ipdata->size, nsize,
			    0, HAMMER2_EMBEDDED_BYTES,
			    0, (int)nsize,
			    1);
		/* ipdata = &ip->chain->data->ipdata; RELOAD */
		return;
	}

	/*
	 * Calculate the blocksize at the original EOF and resize the block
	 * if necessary.  Adjust the file size in the inode.
	 *
	 * NOTE(review): nblksize is computed at osize (not nsize) after
	 * ipdata->size has been bumped -- apparently deliberate, to get
	 * the NEW blocksize of the block containing the OLD EOF once the
	 * file has grown; confirm hammer2_calc_logical() semantics.
	 */
	osize = ipdata->size;
	oblksize = hammer2_calc_logical(ip, osize, &obase, &leof);
	ipdata->size = nsize;
	nblksize = hammer2_calc_logical(ip, osize, &nbase, &leof);

	/*
	 * Do all required vnode operations, but do not mess with the
	 * buffer straddling the orignal EOF.
	 */
	nvextendbuf(ip->vp,
		    ipdata->size, nsize,
		    0, nblksize,
		    0, (int)nsize & HAMMER2_PBUFMASK,
		    1);
	ipdata = &ip->chain->data->ipdata;

	/*
	 * Early return if we have no more work to do.
	 */
	if (obase == nbase && oblksize == nblksize &&
	    (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) == 0) {
		return;
	}

	/*
	 * We have work to do, including possibly resizing the buffer
	 * at the previous EOF point and turning off DIRECTDATA mode.
	 */
	bp = NULL;
	if (((int)osize & HAMMER2_PBUFMASK)) {
		/* old EOF is mid-block: bring its buffer in */
		error = bread(ip->vp, obase, oblksize, &bp);
		KKASSERT(error == 0);
	}

	/*
	 * Disable direct-data mode by loading up a buffer cache buffer
	 * with the data, then converting the inode data area into the
	 * inode indirect block array area.
	 */
	if (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
		ipdata->op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
		bzero(&ipdata->u.blockset, sizeof(ipdata->u.blockset));
	}

	/*
	 * Resize the chain element at the old EOF.
	 */
	if (((int)osize & HAMMER2_PBUFMASK)) {
retry:
		error = 0;
		parent = hammer2_chain_lookup_init(ip->chain, 0);
		nradix = hammer2_allocsize(nblksize);

		chain = hammer2_chain_lookup(&parent,
					     obase, obase,
					     HAMMER2_LOOKUP_NODATA);
		if (chain == NULL) {
			/* hole at the old EOF: create a fresh DATA chain */
			error = hammer2_chain_create(trans, &parent, &chain,
						     obase, nblksize,
						     HAMMER2_BREF_TYPE_DATA,
						     nblksize);
			if (chain == NULL) {
				hammer2_chain_lookup_done(parent);
				/* goto after panic() is dead code; see
				 * matching note in hammer2_assign_physical */
				panic("hammer2_chain_create: par=%p error=%d\n",
					parent, error);
				goto retry;
			}
			/*ip->delta_dcount += nblksize;*/
		} else {
			KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
			hammer2_chain_resize(trans, ip, bp,
					     parent, &chain,
					     nradix,
					     HAMMER2_MODIFY_OPTDATA);
		}
		if (obase != nbase) {
			/* block base moved: buffer grows to full size */
			if (oblksize != HAMMER2_PBUFSIZE)
				allocbuf(bp, HAMMER2_PBUFSIZE);
		} else {
			/* same base: buffer grows to the new blocksize */
			if (oblksize != nblksize)
				allocbuf(bp, nblksize);
		}
		bp->b_bio2.bio_offset = chain->bref.data_off &
					HAMMER2_OFF_MASK;
		hammer2_chain_unlock(chain);
		if (bp->b_bcount == HAMMER2_PBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;
		bdwrite(bp);
		hammer2_chain_lookup_done(parent);  /* must be after bdwrite */
	}
}
1482
1483 static
1484 int
1485 hammer2_vop_nresolve(struct vop_nresolve_args *ap)
1486 {
1487         hammer2_inode_t *ip;
1488         hammer2_inode_t *dip;
1489         hammer2_mount_t *hmp;
1490         hammer2_chain_t *parent;
1491         hammer2_chain_t *chain;
1492         hammer2_chain_t *ochain;
1493         hammer2_trans_t trans;
1494         struct namecache *ncp;
1495         const uint8_t *name;
1496         size_t name_len;
1497         hammer2_key_t lhc;
1498         int error = 0;
1499         struct vnode *vp;
1500
1501         dip = VTOI(ap->a_dvp);
1502         hmp = dip->hmp;
1503         ncp = ap->a_nch->ncp;
1504         name = ncp->nc_name;
1505         name_len = ncp->nc_nlen;
1506         lhc = hammer2_dirhash(name, name_len);
1507
1508         /*
1509          * Note: In DragonFly the kernel handles '.' and '..'.
1510          */
1511         parent = hammer2_inode_lock_sh(dip);
1512         chain = hammer2_chain_lookup(&parent,
1513                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
1514                                      HAMMER2_LOOKUP_SHARED);
1515         while (chain) {
1516                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
1517                     name_len == chain->data->ipdata.name_len &&
1518                     bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
1519                         break;
1520                 }
1521                 chain = hammer2_chain_next(&parent, chain,
1522                                            lhc, lhc + HAMMER2_DIRHASH_LOMASK,
1523                                            HAMMER2_LOOKUP_SHARED);
1524         }
1525         hammer2_inode_unlock_sh(dip, parent);
1526
1527         /*
1528          * If the inode represents a forwarding entry for a hardlink we have
1529          * to locate the actual inode.  The original ip is saved for possible
1530          * deconsolidation.  (ip) will only be set to non-NULL when we have
1531          * to locate the real file via a hardlink.  ip will be referenced but
1532          * not locked in that situation.  chain is passed in locked and
1533          * returned locked.
1534          *
1535          * XXX what kind of chain lock?
1536          */
1537         ochain = NULL;
1538         if (chain && chain->data->ipdata.type == HAMMER2_OBJTYPE_HARDLINK) {
1539                 error = hammer2_hardlink_find(dip, &chain, &ochain);
1540                 if (error) {
1541                         kprintf("hammer2: unable to find hardlink\n");
1542                         if (chain) {
1543                                 hammer2_chain_unlock(chain);
1544                                 chain = NULL;
1545                         }
1546                         goto failed;
1547                 }
1548         }
1549
1550         /*
1551          * Deconsolidate any hardlink whos nlinks == 1.  Ignore errors.
1552          * If an error occurs chain and ip are left alone.
1553          *
1554          * XXX upgrade shared lock?
1555          */
1556         if (ochain && chain && chain->data->ipdata.nlinks == 1 && !hmp->ronly) {
1557                 kprintf("hammer2: need to unconsolidate hardlink for %s\n",
1558                         chain->data->ipdata.filename);
1559                 /* XXX retain shared lock on dip? (currently not held) */
1560                 hammer2_trans_init(dip->hmp, &trans, 0);
1561                 hammer2_hardlink_deconsolidate(&trans, dip, &chain, &ochain);
1562                 hammer2_trans_done(&trans);
1563         }
1564
1565         /*
1566          * Acquire the related vnode
1567          *
1568          * NOTE: For error processing, only ENOENT resolves the namecache
1569          *       entry to NULL, otherwise we just return the error and
1570          *       leave the namecache unresolved.
1571          *
1572          * NOTE: multiple hammer2_inode structures can be aliased to the
1573          *       same chain element, for example for hardlinks.  This
1574          *       use case does not 'reattach' inode associations that
1575          *       might already exist, but always allocates a new one.
1576          *
1577          * WARNING: inode structure is locked exclusively via inode_get
1578          *          but chain was locked shared.  inode_unlock_ex()
1579          *          will handle it properly.
1580          */
1581         if (chain) {
1582                 ip = hammer2_inode_get(hmp, dip->pmp, dip, chain);
1583                 vp = hammer2_igetv(ip, &error);
1584                 if (error == 0) {
1585                         vn_unlock(vp);
1586                         cache_setvp(ap->a_nch, vp);
1587                 } else if (error == ENOENT) {
1588                         cache_setvp(ap->a_nch, NULL);
1589                 }
1590                 hammer2_inode_unlock_ex(ip, chain);
1591
1592                 /*
1593                  * The vp should not be released until after we've disposed
1594                  * of our locks, because it might cause vop_inactive() to
1595                  * be called.
1596                  */
1597                 if (vp)
1598                         vrele(vp);
1599         } else {
1600                 error = ENOENT;
1601                 cache_setvp(ap->a_nch, NULL);
1602         }
1603 failed:
1604         KASSERT(error || ap->a_nch->ncp->nc_vp != NULL,
1605                 ("resolve error %d/%p chain %p ap %p\n",
1606                  error, ap->a_nch->ncp->nc_vp, chain, ap));
1607         if (ochain)
1608                 hammer2_chain_drop(ochain);
1609         return error;
1610 }
1611
1612 static
1613 int
1614 hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1615 {
1616         hammer2_inode_t *dip;
1617         hammer2_inode_t *ip;
1618         hammer2_mount_t *hmp;
1619         hammer2_chain_t *parent;
1620         int error;
1621
1622         dip = VTOI(ap->a_dvp);
1623         hmp = dip->hmp;
1624
1625         if ((ip = dip->pip) == NULL) {
1626                 *ap->a_vpp = NULL;
1627                 return ENOENT;
1628         }
1629         parent = hammer2_inode_lock_ex(ip);
1630         *ap->a_vpp = hammer2_igetv(ip, &error);
1631         hammer2_inode_unlock_ex(ip, parent);
1632
1633         return error;
1634 }
1635
1636 static
1637 int
1638 hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
1639 {
1640         hammer2_mount_t *hmp;
1641         hammer2_inode_t *dip;
1642         hammer2_inode_t *nip;
1643         hammer2_trans_t trans;
1644         hammer2_chain_t *chain;
1645         struct namecache *ncp;
1646         const uint8_t *name;
1647         size_t name_len;
1648         int error;
1649
1650         dip = VTOI(ap->a_dvp);
1651         hmp = dip->hmp;
1652         if (hmp->ronly)
1653                 return (EROFS);
1654
1655         ncp = ap->a_nch->ncp;
1656         name = ncp->nc_name;
1657         name_len = ncp->nc_nlen;
1658
1659         hammer2_trans_init(hmp, &trans, 0);
1660         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1661                                    name, name_len, &chain, &error);
1662         if (error) {
1663                 KKASSERT(nip == NULL);
1664                 *ap->a_vpp = NULL;
1665         } else {
1666                 *ap->a_vpp = hammer2_igetv(nip, &error);
1667                 hammer2_inode_unlock_ex(nip, chain);
1668         }
1669         hammer2_trans_done(&trans);
1670
1671         if (error == 0) {
1672                 cache_setunresolved(ap->a_nch);
1673                 cache_setvp(ap->a_nch, *ap->a_vpp);
1674         }
1675         return error;
1676 }
1677
1678 /*
1679  * Return the largest contiguous physical disk range for the logical
1680  * request.
1681  *
1682  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
1683  */
1684 static
1685 int
1686 hammer2_vop_bmap(struct vop_bmap_args *ap)
1687 {
1688         struct vnode *vp;
1689         hammer2_mount_t *hmp;
1690         hammer2_inode_t *ip;
1691         hammer2_chain_t *parent;
1692         hammer2_chain_t *chain;
1693         hammer2_key_t lbeg;
1694         hammer2_key_t lend;
1695         hammer2_off_t pbeg;
1696         hammer2_off_t pbytes;
1697         hammer2_off_t array[HAMMER2_BMAP_COUNT][2];
1698         int loff;
1699         int ai;
1700
1701         /*
1702          * Only supported on regular files
1703          *
1704          * Only supported for read operations (required for cluster_read).
1705          * The block allocation is delayed for write operations.
1706          */
1707         vp = ap->a_vp;
1708         if (vp->v_type != VREG)
1709                 return (EOPNOTSUPP);
1710         if (ap->a_cmd != BUF_CMD_READ)
1711                 return (EOPNOTSUPP);
1712
1713         ip = VTOI(vp);
1714         hmp = ip->hmp;
1715         bzero(array, sizeof(array));
1716
1717         /*
1718          * Calculate logical range
1719          */
1720         KKASSERT((ap->a_loffset & HAMMER2_LBUFMASK64) == 0);
1721         lbeg = ap->a_loffset & HAMMER2_OFF_MASK_HI;
1722         lend = lbeg + HAMMER2_BMAP_COUNT * HAMMER2_PBUFSIZE - 1;
1723         if (lend < lbeg)
1724                 lend = lbeg;
1725         loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;
1726
1727         parent = hammer2_inode_lock_sh(ip);
1728         chain = hammer2_chain_lookup(&parent,
1729                                      lbeg, lend,
1730                                      HAMMER2_LOOKUP_NODATA |
1731                                      HAMMER2_LOOKUP_SHARED);
1732         if (chain == NULL) {
1733                 *ap->a_doffsetp = ZFOFFSET;
1734                 hammer2_inode_unlock_sh(ip, parent);
1735                 return (0);
1736         }
1737
1738         while (chain) {
1739                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1740                         ai = (chain->bref.key - lbeg) / HAMMER2_PBUFSIZE;
1741                         KKASSERT(ai >= 0 && ai < HAMMER2_BMAP_COUNT);
1742                         array[ai][0] = chain->bref.data_off & HAMMER2_OFF_MASK;
1743                         array[ai][1] = chain->bytes;
1744                 }
1745                 chain = hammer2_chain_next(&parent, chain,
1746                                            lbeg, lend,
1747                                            HAMMER2_LOOKUP_NODATA |
1748                                            HAMMER2_LOOKUP_SHARED);
1749         }
1750         hammer2_inode_unlock_sh(ip, parent);
1751
1752         /*
1753          * If the requested loffset is not mappable physically we can't
1754          * bmap.  The caller will have to access the file data via a
1755          * device buffer.
1756          */
1757         if (array[0][0] == 0 || array[0][1] < loff + HAMMER2_LBUFSIZE) {
1758                 *ap->a_doffsetp = NOOFFSET;
1759                 return (0);
1760         }
1761
1762         /*
1763          * Calculate the physical disk offset range for array[0]
1764          */
1765         pbeg = array[0][0] + loff;
1766         pbytes = array[0][1] - loff;
1767
1768         for (ai = 1; ai < HAMMER2_BMAP_COUNT; ++ai) {
1769                 if (array[ai][0] != pbeg + pbytes)
1770                         break;
1771                 pbytes += array[ai][1];
1772         }
1773
1774         *ap->a_doffsetp = pbeg;
1775         if (ap->a_runp)
1776                 *ap->a_runp = pbytes;
1777         return (0);
1778 }
1779
static
int
hammer2_vop_open(struct vop_open_args *ap)
{
	/*
	 * No hammer2-specific open-time processing; defer to the
	 * standard handler.
	 */
	return (vop_stdopen(ap));
}
1786
1787 /*
1788  * hammer2_vop_advlock { vp, id, op, fl, flags }
1789  */
1790 static
1791 int
1792 hammer2_vop_advlock(struct vop_advlock_args *ap)
1793 {
1794         hammer2_inode_t *ip = VTOI(ap->a_vp);
1795         hammer2_chain_t *parent;
1796         hammer2_off_t size;
1797
1798         parent = hammer2_inode_lock_sh(ip);
1799         size = parent->data->ipdata.size;
1800         hammer2_inode_unlock_sh(ip, parent);
1801         return (lf_advlock(ap, &ip->advlock, size));
1802 }
1803
1804
static
int
hammer2_vop_close(struct vop_close_args *ap)
{
	/*
	 * No hammer2-specific close-time processing; defer to the
	 * standard handler.
	 */
	return (vop_stdclose(ap));
}
1811
1812 /*
1813  * hammer2_vop_nlink { nch, dvp, vp, cred }
1814  *
1815  * Create a hardlink from (vp) to {dvp, nch}.
1816  */
static
int
hammer2_vop_nlink(struct vop_nlink_args *ap)
{
	hammer2_inode_t *dip;	/* target directory to create link in */
	hammer2_inode_t *ip;	/* inode we are hardlinking to */
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	hammer2_trans_t trans;
	struct namecache *ncp;
	const uint8_t *name;
	size_t name_len;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;
	if (hmp->ronly)
		return (EROFS);

	/*
	 * The new directory entry's name comes from the namecache.
	 */
	ncp = ap->a_nch->ncp;
	name = ncp->nc_name;
	name_len = ncp->nc_nlen;
	hammer2_trans_init(hmp, &trans, 0);

	/*
	 * ip represents the file being hardlinked.  The file could be a
	 * normal file or a hardlink target if it has already been hardlinked.
	 * If ip is a hardlinked target then ip->pip represents the location
	 * of the hardlinked target, NOT the location of the hardlink pointer.
	 *
	 * Bump nlinks and potentially also create or move the hardlink
	 * target in the parent directory common to (ip) and (dip).  The
	 * consolidation code can modify ip->chain and ip->pip.  The
	 * returned chain is locked.
	 */
	ip = VTOI(ap->a_vp);
	chain = hammer2_inode_lock_ex(ip);
	error = hammer2_hardlink_consolidate(&trans, ip, &chain, dip, 1);
	if (error)
		goto done;

	/*
	 * Create a directory entry connected to the specified chain.
	 * The hardlink consolidation code has already adjusted ip->pip
	 * to the common parent directory containing the actual hardlink
	 *
	 * (which may be different from dip where we created our hardlink
	 * entry. ip->chain always represents the actual hardlink and not
	 * any of the pointers to the actual hardlink).
	 */
	error = hammer2_inode_connect(&trans, 1,
				      dip, &chain,
				      name, name_len);
	if (error == 0) {
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, ap->a_vp);
	}
done:
	/*
	 * (chain) was passed by reference to the consolidation and
	 * connect code above and may have been replaced; unlock
	 * whatever we currently hold.
	 */
	hammer2_inode_unlock_ex(ip, chain);
	hammer2_trans_done(&trans);

	return error;
}
1880
1881 /*
1882  * hammer2_vop_ncreate { nch, dvp, vpp, cred, vap }
1883  *
1884  * The operating system has already ensured that the directory entry
1885  * does not exist and done all appropriate namespace locking.
1886  */
1887 static
1888 int
1889 hammer2_vop_ncreate(struct vop_ncreate_args *ap)
1890 {
1891         hammer2_mount_t *hmp;
1892         hammer2_inode_t *dip;
1893         hammer2_inode_t *nip;
1894         hammer2_trans_t trans;
1895         hammer2_chain_t *nchain;
1896         struct namecache *ncp;
1897         const uint8_t *name;
1898         size_t name_len;
1899         int error;
1900
1901         dip = VTOI(ap->a_dvp);
1902         hmp = dip->hmp;
1903         if (hmp->ronly)
1904                 return (EROFS);
1905
1906         ncp = ap->a_nch->ncp;
1907         name = ncp->nc_name;
1908         name_len = ncp->nc_nlen;
1909         hammer2_trans_init(hmp, &trans, 0);
1910
1911         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1912                                    name, name_len, &nchain, &error);
1913         if (error) {
1914                 KKASSERT(nip == NULL);
1915                 *ap->a_vpp = NULL;
1916         } else {
1917                 *ap->a_vpp = hammer2_igetv(nip, &error);
1918                 hammer2_inode_unlock_ex(nip, nchain);
1919         }
1920         hammer2_trans_done(&trans);
1921
1922         if (error == 0) {
1923                 cache_setunresolved(ap->a_nch);
1924                 cache_setvp(ap->a_nch, *ap->a_vpp);
1925         }
1926         return error;
1927 }
1928
1929 /*
1930  * hammer2_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1931  */
static
int
hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	hammer2_mount_t *hmp;
	hammer2_inode_t *dip;
	hammer2_inode_t *nip;
	hammer2_chain_t *nparent;
	hammer2_trans_t trans;
	struct namecache *ncp;
	const uint8_t *name;
	size_t name_len;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;
	if (hmp->ronly)
		return (EROFS);

	ncp = ap->a_nch->ncp;
	name = ncp->nc_name;
	name_len = ncp->nc_nlen;
	hammer2_trans_init(hmp, &trans, 0);

	ap->a_vap->va_type = VLNK;	/* enforce type */

	/*
	 * Create the symlink's inode.  On failure the transaction is
	 * terminated and the error returned immediately.
	 */
	nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
				   name, name_len, &nparent, &error);
	if (error) {
		KKASSERT(nip == NULL);
		*ap->a_vpp = NULL;
		hammer2_trans_done(&trans);
		return error;
	}
	*ap->a_vpp = hammer2_igetv(nip, &error);

	/*
	 * Build the softlink (~like file data) and finalize the namecache.
	 */
	if (error == 0) {
		size_t bytes;
		struct uio auio;
		struct iovec aiov;
		hammer2_inode_data_t *nipdata;

		nipdata = &nip->chain->data->ipdata;
		bytes = strlen(ap->a_target);

		if (bytes <= HAMMER2_EMBEDDED_BYTES) {
			/*
			 * Short targets are stored directly in the
			 * inode's embedded data area.
			 */
			KKASSERT(nipdata->op_flags &
				 HAMMER2_OPFLAG_DIRECTDATA);
			bcopy(ap->a_target, nipdata->u.data, bytes);
			nipdata->size = bytes;
		} else {
			/*
			 * Longer targets are written out as regular
			 * file data via a kernel-space uio.
			 */
			bzero(&auio, sizeof(auio));
			bzero(&aiov, sizeof(aiov));
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_WRITE;
			auio.uio_resid = bytes;
			auio.uio_iovcnt = 1;
			auio.uio_td = curthread;
			aiov.iov_base = ap->a_target;
			aiov.iov_len = bytes;
			error = hammer2_write_file(&trans, nip, &nparent,
						   &auio, IO_APPEND, 0);
			nipdata = &nip->chain->data->ipdata; /* RELOAD */
			/* XXX handle error */
			error = 0;
		}
	}
	hammer2_inode_unlock_ex(nip, nparent);
	hammer2_trans_done(&trans);

	/*
	 * Finalize namecache
	 */
	if (error == 0) {
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, *ap->a_vpp);
		/* hammer2_knote(ap->a_dvp, NOTE_WRITE); */
	}
	return error;
}
2016
2017 /*
2018  * hammer2_vop_nremove { nch, dvp, cred }
2019  */
2020 static
2021 int
2022 hammer2_vop_nremove(struct vop_nremove_args *ap)
2023 {
2024         hammer2_inode_t *dip;
2025         hammer2_mount_t *hmp;
2026         hammer2_trans_t trans;
2027         struct namecache *ncp;
2028         const uint8_t *name;
2029         size_t name_len;
2030         int error;
2031
2032         dip = VTOI(ap->a_dvp);
2033         hmp = dip->hmp;
2034         if (hmp->ronly)
2035                 return(EROFS);
2036
2037         ncp = ap->a_nch->ncp;
2038         name = ncp->nc_name;
2039         name_len = ncp->nc_nlen;
2040         hammer2_trans_init(hmp, &trans, 0);
2041         error = hammer2_unlink_file(&trans, dip, name, name_len, 0, NULL);
2042         hammer2_trans_done(&trans);
2043         if (error == 0) {
2044                 cache_unlink(ap->a_nch);
2045         }
2046         return (error);
2047 }
2048
2049 /*
2050  * hammer2_vop_nrmdir { nch, dvp, cred }
2051  */
2052 static
2053 int
2054 hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
2055 {
2056         hammer2_inode_t *dip;
2057         hammer2_mount_t *hmp;
2058         hammer2_trans_t trans;
2059         struct namecache *ncp;
2060         const uint8_t *name;
2061         size_t name_len;
2062         int error;
2063
2064         dip = VTOI(ap->a_dvp);
2065         hmp = dip->hmp;
2066         if (hmp->ronly)
2067                 return(EROFS);
2068
2069         ncp = ap->a_nch->ncp;
2070         name = ncp->nc_name;
2071         name_len = ncp->nc_nlen;
2072
2073         hammer2_trans_init(hmp, &trans, 0);
2074         error = hammer2_unlink_file(&trans, dip, name, name_len, 1, NULL);
2075         hammer2_trans_done(&trans);
2076         if (error == 0) {
2077                 cache_unlink(ap->a_nch);
2078         }
2079         return (error);
2080 }
2081
2082 /*
2083  * hammer2_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
2084  */
static
int
hammer2_vop_nrename(struct vop_nrename_args *ap)
{
	struct namecache *fncp;
	struct namecache *tncp;
	hammer2_inode_t *fdip;
	hammer2_inode_t *tdip;
	hammer2_inode_t *ip;
	hammer2_chain_t *chain;
	hammer2_mount_t *hmp;
	hammer2_trans_t trans;
	const uint8_t *fname;
	size_t fname_len;
	const uint8_t *tname;
	size_t tname_len;
	int error;
	int hlink;	/* set by unlink_file: entry was a hardlink pointer */

	/*
	 * Cross-mount renames are not supported.
	 */
	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);	/* source directory */
	tdip = VTOI(ap->a_tdvp);	/* target directory */

	hmp = fdip->hmp;		/* check read-only filesystem */
	if (hmp->ronly)
		return(EROFS);

	fncp = ap->a_fnch->ncp;		/* entry name in source */
	fname = fncp->nc_name;
	fname_len = fncp->nc_nlen;

	tncp = ap->a_tnch->ncp;		/* entry name in target */
	tname = tncp->nc_name;
	tname_len = tncp->nc_nlen;

	hammer2_trans_init(hmp, &trans, 0);

	/*
	 * ip is the inode being renamed.  If this is a hardlink then
	 * ip represents the actual file and not the hardlink marker.
	 */
	ip = VTOI(fncp->nc_vp);
	chain = NULL;

	/*
	 * Keep a tight grip on the inode so the temporary unlinking from
	 * the source location prior to linking to the target location
	 * does not cause the chain to be destroyed.
	 *
	 * NOTE: To avoid deadlocks we cannot lock (ip) while we are
	 *       unlinking elements from their directories.  Locking
	 *       the nlinks field does not lock the whole inode.
	 */
	hammer2_inode_ref(ip);

	/*
	 * Remove target if it exists.  ENOENT (no pre-existing target)
	 * is not an error.
	 */
	error = hammer2_unlink_file(&trans, tdip, tname, tname_len, -1, NULL);
	if (error && error != ENOENT)
		goto done;
	cache_setunresolved(ap->a_tnch);

	/*
	 * When renaming a hardlinked file we may have to re-consolidate
	 * the location of the hardlink target.  Since the element is simply
	 * being moved, nlinks is not modified in this case.
	 *
	 * If ip represents a regular file the consolidation code essentially
	 * does nothing other than return the same locked chain that was
	 * passed in.
	 *
	 * The returned chain will be locked.
	 *
	 * WARNING!  We do not currently have a local copy of ipdata but
	 *           we do use one later remember that it must be reloaded
	 *           on any modification to the inode, including connects.
	 */
	chain = hammer2_inode_lock_ex(ip);
	error = hammer2_hardlink_consolidate(&trans, ip, &chain, tdip, 0);
	if (error)
		goto done;

	/*
	 * Disconnect (fdip, fname) from the source directory.  This will
	 * disconnect (ip) if it represents a direct file.  If (ip) represents
	 * a hardlink the HARDLINK pointer object will be removed but the
	 * hardlink will stay intact.
	 *
	 * The target chain may be marked DELETED but will not be destroyed
	 * since we retain our hold on ip and chain.
	 */
	error = hammer2_unlink_file(&trans, fdip, fname, fname_len, -1, &hlink);
	KKASSERT(error != EAGAIN);
	if (error)
		goto done;

	/*
	 * Reconnect ip to target directory using chain.  Chains cannot
	 * actually be moved, so this will duplicate the chain in the new
	 * spot and assign it to the ip, replacing the old chain.
	 *
	 * WARNING: chain locks can lock buffer cache buffers, to avoid
	 *          deadlocks we want to unlock before issuing a cache_*()
	 *          op (that might have to lock a vnode).
	 */
	error = hammer2_inode_connect(&trans, hlink,
				      tdip, &chain,
				      tname, tname_len);
	if (error == 0) {
		KKASSERT(chain != NULL);
		hammer2_inode_repoint(ip, (hlink ? ip->pip : tdip), chain);
		cache_rename(ap->a_fnch, ap->a_tnch);
	}
done:
	/*
	 * Release the chain lock, our inode reference, and close the
	 * transaction on all paths.
	 */
	hammer2_inode_unlock_ex(ip, chain);
	hammer2_inode_drop(ip);
	hammer2_trans_done(&trans);

	return (error);
}
2210
2211 /*
2212  * Strategy code
2213  *
2214  * WARNING: The strategy code cannot safely use hammer2 transactions
2215  *          as this can deadlock against vfs_sync's vfsync() call
2216  *          if multiple flushes are queued.
2217  */
2218 static int hammer2_strategy_read(struct vop_strategy_args *ap);
2219 static int hammer2_strategy_write(struct vop_strategy_args *ap);
2220
2221 static
2222 int
2223 hammer2_vop_strategy(struct vop_strategy_args *ap)
2224 {
2225         struct bio *biop;
2226         struct buf *bp;
2227         int error;
2228
2229         biop = ap->a_bio;
2230         bp = biop->bio_buf;
2231
2232         switch(bp->b_cmd) {
2233         case BUF_CMD_READ:
2234                 error = hammer2_strategy_read(ap);
2235                 ++hammer2_iod_file_read;
2236                 break;
2237         case BUF_CMD_WRITE:
2238                 error = hammer2_strategy_write(ap);
2239                 ++hammer2_iod_file_write;
2240                 break;
2241         default:
2242                 bp->b_error = error = EINVAL;
2243                 bp->b_flags |= B_ERROR;
2244                 biodone(biop);
2245                 break;
2246         }
2247
2248         return (error);
2249 }
2250
/*
 * Logical buffer read strategy.  Resolve the logical offset to a device
 * offset (cached in the translated bio), then either zero-fill, forward
 * the I/O to the device, or copy data embedded in the inode.
 */
static
int
hammer2_strategy_read(struct vop_strategy_args *ap)
{
	struct buf *bp;
	struct bio *bio;
	struct bio *nbio;	/* translated (device-level) bio */
	hammer2_mount_t *hmp;
	hammer2_inode_t *ip;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t lbase;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	nbio = push_bio(bio);

	lbase = bio->bio_offset;
	chain = NULL;
	KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);

#if 0
	kprintf("read lbase %jd cached %016jx\n",
		lbase, nbio->bio_offset);
#endif

	/*
	 * We must characterize the logical->physical translation if it
	 * has not already been cached.
	 *
	 * Physical data references < LBUFSIZE are never cached.  This
	 * includes both small-block allocations and inode-embedded data.
	 */
	if (nbio->bio_offset == NOOFFSET) {
		parent = hammer2_inode_lock_sh(ip);

		chain = hammer2_chain_lookup(&parent, lbase, lbase,
					     HAMMER2_LOOKUP_NODATA |
					     HAMMER2_LOOKUP_SHARED);
		if (chain == NULL) {
			/*
			 * Data is zero-fill
			 */
			nbio->bio_offset = ZFOFFSET;
		} else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			/*
			 * Data is embedded in the inode (do nothing)
			 */
			KKASSERT(chain == parent);
			hammer2_chain_unlock(chain);
			nbio->bio_offset = NOOFFSET;
		} else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			/*
			 * Data is on-media
			 */
			KKASSERT(bp->b_bcount == chain->bytes);
			nbio->bio_offset = chain->bref.data_off &
					   HAMMER2_OFF_MASK;
			hammer2_chain_unlock(chain);
			KKASSERT(nbio->bio_offset != 0);
		} else {
			panic("hammer2_strategy_read: unknown bref type");
		}
		hammer2_inode_unlock_sh(ip, parent);
	}

	if (hammer2_debug & 0x0020) {
		kprintf("read %016jx %016jx\n",
			bio->bio_offset, nbio->bio_offset);
	}

	if (nbio->bio_offset == ZFOFFSET) {
		/*
		 * Data is zero-fill
		 */
		bp->b_resid = 0;
		bp->b_error = 0;
		bzero(bp->b_data, bp->b_bcount);
		biodone(nbio);
	} else if (nbio->bio_offset != NOOFFSET) {
		/*
		 * Forward direct IO to the device
		 */
		vn_strategy(hmp->devvp, nbio);
	} else {
		/*
		 * Data is embedded in inode.
		 *
		 * NOTE(review): chain was unlocked in the lookup block
		 * above and the shared inode lock has been released, yet
		 * chain->data is dereferenced here -- looks racy; verify.
		 */
		bcopy(chain->data->ipdata.u.data, bp->b_data,
		      HAMMER2_EMBEDDED_BYTES);
		bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
		      bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
		bp->b_resid = 0;
		bp->b_error = 0;
		biodone(nbio);
	}
	return (0);
}
2351
/*
 * Logical buffer write strategy.  Either copy the buffer into the
 * inode's embedded data area or forward the already-translated bio
 * to the device.
 */
static
int
hammer2_strategy_write(struct vop_strategy_args *ap)
{
	struct buf *bp;
	struct bio *bio;
	struct bio *nbio;	/* translated (device-level) bio */
	hammer2_chain_t *chain;
	hammer2_mount_t *hmp;
	hammer2_inode_t *ip;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	nbio = push_bio(bio);

	/*
	 * The translated offset must already have been assigned by the
	 * caller; zero-fill writes are not legal here.
	 */
	KKASSERT((bio->bio_offset & HAMMER2_PBUFMASK64) == 0);
	KKASSERT(nbio->bio_offset != 0 && nbio->bio_offset != ZFOFFSET);

	if (nbio->bio_offset == NOOFFSET) {
		/*
		 * The data is embedded in the inode.  Note that strategy
		 * calls for embedded data are synchronous in order to
		 * ensure that ip->chain is stable.  Chain modification
		 * status is handled by the caller.
		 */
		KKASSERT(ip->chain->flags & HAMMER2_CHAIN_MODIFIED);
		KKASSERT(bio->bio_offset == 0);
		KKASSERT(ip->chain && ip->chain->data);
		chain = ip->chain;
		bcopy(bp->b_data, chain->data->ipdata.u.data,
		      HAMMER2_EMBEDDED_BYTES);
		bp->b_resid = 0;
		bp->b_error = 0;
		biodone(nbio);
	} else {
		/*
		 * Forward direct IO to the device
		 */
		vn_strategy(hmp->devvp, nbio);
	}
	return (0);
}
2396
2397 /*
2398  * hammer2_vop_ioctl { vp, command, data, fflag, cred }
2399  */
2400 static
2401 int
2402 hammer2_vop_ioctl(struct vop_ioctl_args *ap)
2403 {
2404         hammer2_mount_t *hmp;
2405         hammer2_inode_t *ip;
2406         int error;
2407
2408         ip = VTOI(ap->a_vp);
2409         hmp = ip->hmp;
2410
2411         error = hammer2_ioctl(ip, ap->a_command, (void *)ap->a_data,
2412                               ap->a_fflag, ap->a_cred);
2413         return (error);
2414 }
2415
2416 static
2417 int 
2418 hammer2_vop_mountctl(struct vop_mountctl_args *ap)
2419 {
2420         struct mount *mp;
2421         hammer2_pfsmount_t *pmp;
2422         int rc;
2423
2424         switch (ap->a_op) {
2425         case (MOUNTCTL_SET_EXPORT):
2426                 mp = ap->a_head.a_ops->head.vv_mount;
2427                 pmp = MPTOPMP(mp);
2428
2429                 if (ap->a_ctllen != sizeof(struct export_args))
2430                         rc = (EINVAL);
2431                 else
2432                         rc = vfs_export(mp, &pmp->export,
2433                                         (const struct export_args *)ap->a_ctl);
2434                 break;
2435         default:
2436                 rc = vop_stdmountctl(ap);
2437                 break;
2438         }
2439         return (rc);
2440 }
2441
2442 struct vop_ops hammer2_vnode_vops = {
2443         .vop_default    = vop_defaultop,
2444         .vop_fsync      = hammer2_vop_fsync,
2445         .vop_getpages   = vop_stdgetpages,
2446         .vop_putpages   = vop_stdputpages,
2447         .vop_access     = hammer2_vop_access,
2448         .vop_advlock    = hammer2_vop_advlock,
2449         .vop_close      = hammer2_vop_close,
2450         .vop_nlink      = hammer2_vop_nlink,
2451         .vop_ncreate    = hammer2_vop_ncreate,
2452         .vop_nsymlink   = hammer2_vop_nsymlink,
2453         .vop_nremove    = hammer2_vop_nremove,
2454         .vop_nrmdir     = hammer2_vop_nrmdir,
2455         .vop_nrename    = hammer2_vop_nrename,
2456         .vop_getattr    = hammer2_vop_getattr,
2457         .vop_setattr    = hammer2_vop_setattr,
2458         .vop_readdir    = hammer2_vop_readdir,
2459         .vop_readlink   = hammer2_vop_readlink,
2460         .vop_getpages   = vop_stdgetpages,
2461         .vop_putpages   = vop_stdputpages,
2462         .vop_read       = hammer2_vop_read,
2463         .vop_write      = hammer2_vop_write,
2464         .vop_open       = hammer2_vop_open,
2465         .vop_inactive   = hammer2_vop_inactive,
2466         .vop_reclaim    = hammer2_vop_reclaim,
2467         .vop_nresolve   = hammer2_vop_nresolve,
2468         .vop_nlookupdotdot = hammer2_vop_nlookupdotdot,
2469         .vop_nmkdir     = hammer2_vop_nmkdir,
2470         .vop_ioctl      = hammer2_vop_ioctl,
2471         .vop_mountctl   = hammer2_vop_mountctl,
2472         .vop_bmap       = hammer2_vop_bmap,
2473         .vop_strategy   = hammer2_vop_strategy,
2474 };
2475
/*
 * Operations vector for special-device vnodes.
 * NOTE(review): intentionally empty in this revision -- presumably to
 * be populated later; confirm how spec vnodes are dispatched meanwhile.
 */
struct vop_ops hammer2_spec_vops = {

};
2479
/*
 * Operations vector for fifo vnodes.
 * NOTE(review): intentionally empty in this revision -- presumably to
 * be populated later; confirm how fifo vnodes are dispatched meanwhile.
 */
struct vop_ops hammer2_fifo_vops = {

};