hammer2 - serialized flush work part 3
[dragonfly.git] / sys / vfs / hammer2 / hammer2_vnops.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/fcntl.h>
39 #include <sys/buf.h>
40 #include <sys/proc.h>
41 #include <sys/namei.h>
42 #include <sys/mount.h>
43 #include <sys/vnode.h>
44 #include <sys/mountctl.h>
45 #include <sys/dirent.h>
46 #include <sys/uio.h>
47
48 #include "hammer2.h"
49
50 #define ZFOFFSET        (-2LL)
51
52 static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
53                                 int seqcount);
54 static int hammer2_write_file(hammer2_inode_t *ip, hammer2_chain_t **chainp,
55                                 struct uio *uio, int ioflag, int seqcount);
56 static hammer2_off_t hammer2_assign_physical(hammer2_inode_t *ip,
57                                 hammer2_key_t lbase, int lblksize, int *errorp);
58 static void hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize);
59 static void hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize);
60
61 static __inline
62 void
63 hammer2_knote(struct vnode *vp, int flags)
64 {
65         if (flags)
66                 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
67 }
68
69 /*
70  * Last reference to a vnode is going away but it is still cached.
71  */
static
int
hammer2_vop_inactive(struct vop_inactive_args *ap)
{
        hammer2_chain_t *chain;
        hammer2_inode_t *ip;
        struct vnode *vp;
#if 0
        struct hammer2_mount *hmp;
#endif

        vp = ap->a_vp;
        ip = VTOI(vp);

        /*
         * Degenerate case: the vnode is not associated with an inode.
         * Nothing to do beyond letting the kernel recycle it.
         */
        if (ip == NULL) {
                vrecycle(vp);
                return (0);
        }

        /*
         * Detect updates to the embedded data which may be synchronized by
         * the strategy code.  Simply mark the inode modified so it gets
         * picked up by our normal flush.
         */
        chain = hammer2_inode_lock_ex(ip);
        if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
                atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
                hammer2_chain_modify(ip->hmp, chain, 0);
        }

        /*
         * Check for deleted inodes and recycle immediately.  The inode
         * lock is dropped before vrecycle() so the reclaim path can
         * re-acquire it itself.
         */
        if (chain && (chain->flags & HAMMER2_CHAIN_DELETED)) {
                hammer2_inode_unlock_ex(ip, chain);
                vrecycle(vp);
        } else {
                hammer2_inode_unlock_ex(ip, chain);
        }
        return (0);
}
116
117 /*
118  * Reclaim a vnode so that it can be reused; after the inode is
119  * disassociated, the filesystem must manage it alone.
120  */
static
int
hammer2_vop_reclaim(struct vop_reclaim_args *ap)
{
        hammer2_chain_t *chain;
        hammer2_inode_t *ip;
        hammer2_mount_t *hmp;
        struct vnode *vp;

        vp = ap->a_vp;
        ip = VTOI(vp);
        if (ip == NULL)
                return(0);
        hmp = ip->hmp;

        /*
         * Set SUBMODIFIED so we can detect and propagate the DESTROYED
         * bit in the flush code.
         *
         * The vp/ip disassociation is done while holding the inode lock
         * exclusively so no other thread can observe a half-detached pair.
         */
        chain = hammer2_inode_lock_ex(ip);
        vp->v_data = NULL;
        ip->vp = NULL;
        if (chain->flags & HAMMER2_CHAIN_DELETED) {
                /* NOTE: assertion is redundant with the if-test above */
                KKASSERT(chain->flags & HAMMER2_CHAIN_DELETED);
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROYED |
                                              HAMMER2_CHAIN_SUBMODIFIED);
        }
        hammer2_chain_flush(hmp, chain, 0);
        if (ip->refs > 2)                       /* (our lock + vp ref) */
                hammer2_inode_unlock_ex(ip, chain); /* unlock */
        else
                hammer2_inode_put(ip, chain);       /* unlock & disconnect */
        hammer2_inode_drop(ip);                     /* vp ref */

        /*
         * XXX handle background sync when ip dirty, kernel will no longer
         * notify us regarding this inode because there is no longer a
         * vnode attached to it.
         */

        return (0);
}
163
/*
 * hammer2_vop_fsync { vp, waitfor, flags }
 *
 * Flush the vnode's dirty logical buffers and, for an actual fsync()
 * system call, flush the inode's chain to media.
 */
static
int
hammer2_vop_fsync(struct vop_fsync_args *ap)
{
        hammer2_chain_t *chain;
        hammer2_inode_t *ip;
        hammer2_mount_t *hmp;
        struct vnode *vp;

        vp = ap->a_vp;
        ip = VTOI(vp);
        hmp = ip->hmp;

        /* Flush dirty buffers while holding the inode exclusively */
        chain = hammer2_inode_lock_ex(ip);
        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);

        /*
         * Detect updates to the embedded data which may be synchronized by
         * the strategy code.  Simply mark the inode modified so it gets
         * picked up by our normal flush.
         */
        if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
                atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
                hammer2_chain_modify(hmp, chain, 0);
        }

        /*
         * Calling chain_flush here creates a lot of duplicative
         * COW operations due to non-optimal vnode ordering.
         *
         * Only do it for an actual fsync() syscall.  The other forms
         * which call this function will eventually call chain_flush
         * on the volume root as a catch-all, which is far more optimal.
         */
        atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
        if (ap->a_flags & VOP_FSYNC_SYSCALL)
                hammer2_chain_flush(hmp, chain, 0);
        hammer2_inode_unlock_ex(ip, chain);
        return (0);
}
204
205 static
206 int
207 hammer2_vop_access(struct vop_access_args *ap)
208 {
209         hammer2_inode_t *ip = VTOI(ap->a_vp);
210         hammer2_chain_t *chain;
211         hammer2_inode_data_t *ipdata;
212         uid_t uid;
213         gid_t gid;
214         int error;
215
216         chain = hammer2_inode_lock_sh(ip);
217         ipdata = &chain->data->ipdata;
218         uid = hammer2_to_unix_xid(&ipdata->uid);
219         gid = hammer2_to_unix_xid(&ipdata->gid);
220         error = vop_helper_access(ap, uid, gid, ipdata->mode, ipdata->uflags);
221         hammer2_inode_unlock_sh(ip, chain);
222
223         return (error);
224 }
225
/*
 * hammer2_vop_getattr { vp, vap }
 *
 * Translate the media inode data into the caller's vattr while holding
 * a shared inode lock.
 */
static
int
hammer2_vop_getattr(struct vop_getattr_args *ap)
{
        hammer2_inode_data_t *ipdata;
        hammer2_pfsmount_t *pmp;
        hammer2_inode_t *ip;
        hammer2_chain_t *chain;
        struct vnode *vp;
        struct vattr *vap;

        vp = ap->a_vp;
        vap = ap->a_vap;

        ip = VTOI(vp);
        pmp = ip->pmp;

        chain = hammer2_inode_lock_sh(ip);
        ipdata = &chain->data->ipdata;

        vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
        vap->va_fileid = ipdata->inum;
        vap->va_mode = ipdata->mode;
        vap->va_nlink = ipdata->nlinks;
        vap->va_uid = hammer2_to_unix_xid(&ipdata->uid);
        vap->va_gid = hammer2_to_unix_xid(&ipdata->gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ipdata->size;
        vap->va_blocksize = HAMMER2_PBUFSIZE;
        vap->va_flags = ipdata->uflags;
        hammer2_time_to_timespec(ipdata->ctime, &vap->va_ctime);
        hammer2_time_to_timespec(ipdata->mtime, &vap->va_mtime);
        /*
         * atime is not maintained at this stage (setattr's atime path is
         * compiled out); mtime is deliberately reported in its place.
         */
        hammer2_time_to_timespec(ipdata->mtime, &vap->va_atime);
        vap->va_gen = 1;
        vap->va_bytes = vap->va_size;   /* XXX */
        vap->va_type = hammer2_get_vtype(chain);
        vap->va_filerev = 0;
        vap->va_uid_uuid = ipdata->uid;
        vap->va_gid_uuid = ipdata->gid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        hammer2_inode_unlock_sh(ip, chain);

        return (0);
}
273
/*
 * hammer2_vop_setattr { vp, vap, cred }
 *
 * Apply attribute changes (flags, ownership, size, times, mode) to the
 * inode under an exclusive inode lock.  Each modification path calls
 * hammer2_chain_modify() before touching the media data.
 *
 * NOTE(review): kflags is accumulated but never delivered (no
 * hammer2_knote() call) and domtime is set but never consumed here —
 * presumably placeholders for later work; confirm before relying on
 * kqueue/mtime side effects.
 */
static
int
hammer2_vop_setattr(struct vop_setattr_args *ap)
{
        hammer2_inode_data_t *ipdata;
        hammer2_chain_t *chain;
        hammer2_inode_t *ip;
        hammer2_mount_t *hmp;
        struct vnode *vp;
        struct vattr *vap;
        int error;
        int kflags = 0;
        int domtime = 0;
        uint64_t ctime;

        vp = ap->a_vp;
        vap = ap->a_vap;
        hammer2_update_time(&ctime);

        ip = VTOI(vp);
        hmp = ip->hmp;

        if (hmp->ronly)
                return(EROFS);

        chain = hammer2_inode_lock_ex(ip);
        ipdata = &chain->data->ipdata;
        error = 0;

        /*
         * Changing the file flags is handled first and short-circuits
         * all other attribute processing (unconditional goto done).
         */
        if (vap->va_flags != VNOVAL) {
                u_int32_t flags;

                flags = ipdata->uflags;
                error = vop_helper_setattr_flags(&flags, vap->va_flags,
                                         hammer2_to_unix_xid(&ipdata->uid),
                                         ap->a_cred);
                if (error == 0) {
                        if (ipdata->uflags != flags) {
                                hammer2_chain_modify(hmp, chain, 0);
                                ipdata->uflags = flags;
                                ipdata->ctime = ctime;
                                kflags |= NOTE_ATTRIB;
                        }
                        if (ipdata->uflags & (IMMUTABLE | APPEND)) {
                                error = 0;
                                goto done;
                        }
                }
                goto done;
        }

        /*
         * IMMUTABLE/APPEND files refuse all other attribute changes.
         */
        if (ipdata->uflags & (IMMUTABLE | APPEND)) {
                error = EPERM;
                goto done;
        }

        /*
         * chown/chgrp.  The uid/gid are stored as uuids on media, so
         * compare the converted values before dirtying the chain.
         */
        if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
                mode_t cur_mode = ipdata->mode;
                uid_t cur_uid = hammer2_to_unix_xid(&ipdata->uid);
                gid_t cur_gid = hammer2_to_unix_xid(&ipdata->gid);
                uuid_t uuid_uid;
                uuid_t uuid_gid;

                error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
                                         ap->a_cred,
                                         &cur_uid, &cur_gid, &cur_mode);
                if (error == 0) {
                        hammer2_guid_to_uuid(&uuid_uid, cur_uid);
                        hammer2_guid_to_uuid(&uuid_gid, cur_gid);
                        if (bcmp(&uuid_uid, &ipdata->uid, sizeof(uuid_uid)) ||
                            bcmp(&uuid_gid, &ipdata->gid, sizeof(uuid_gid)) ||
                            ipdata->mode != cur_mode
                        ) {
                                hammer2_chain_modify(hmp, chain, 0);
                                ipdata->uid = uuid_uid;
                                ipdata->gid = uuid_gid;
                                ipdata->mode = cur_mode;
                                ipdata->ctime = ctime;
                        }
                        kflags |= NOTE_ATTRIB;
                }
        }

        /*
         * Resize the file
         */
        if (vap->va_size != VNOVAL && ipdata->size != vap->va_size) {
                switch(vp->v_type) {
                case VREG:
                        if (vap->va_size == ipdata->size)
                                break;
                        if (vap->va_size < ipdata->size) {
                                hammer2_truncate_file(ip, vap->va_size);
                        } else {
                                hammer2_extend_file(ip, vap->va_size);
                        }
                        domtime = 1;
                        break;
                default:
                        error = EINVAL;
                        goto done;
                }
        }
#if 0
        /* atime not supported */
        if (vap->va_atime.tv_sec != VNOVAL) {
                hammer2_chain_modify(hmp, chain, 0);
                ipdata->atime = hammer2_timespec_to_time(&vap->va_atime);
                kflags |= NOTE_ATTRIB;
        }
#endif
        if (vap->va_mtime.tv_sec != VNOVAL) {
                hammer2_chain_modify(hmp, chain, 0);
                ipdata->mtime = hammer2_timespec_to_time(&vap->va_mtime);
                kflags |= NOTE_ATTRIB;
        }
        if (vap->va_mode != (mode_t)VNOVAL) {
                mode_t cur_mode = ipdata->mode;
                uid_t cur_uid = hammer2_to_unix_xid(&ipdata->uid);
                gid_t cur_gid = hammer2_to_unix_xid(&ipdata->gid);

                error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
                                         cur_uid, cur_gid, &cur_mode);
                if (error == 0 && ipdata->mode != cur_mode) {
                        hammer2_chain_modify(hmp, chain, 0);
                        ipdata->mode = cur_mode;
                        ipdata->ctime = ctime;
                        kflags |= NOTE_ATTRIB;
                }
        }
done:
        hammer2_inode_unlock_ex(ip, chain);
        return (error);
}
406
/*
 * hammer2_vop_readdir { vp, uio, cred, eofflag, ncookies, cookies }
 *
 * Emit directory entries: the artificial "." and ".." entries first,
 * then a shared-locked chain scan keyed by directory hash.
 */
static
int
hammer2_vop_readdir(struct vop_readdir_args *ap)
{
        hammer2_inode_data_t *ipdata;
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;
        hammer2_inode_t *xip;
        hammer2_chain_t *parent;
        hammer2_chain_t *xparent;
        hammer2_chain_t *chain;
        hammer2_tid_t inum;
        hammer2_key_t lkey;
        struct uio *uio;
        off_t *cookies;
        off_t saveoff;
        int cookie_index;
        int ncookies;
        int error;
        int dtype;
        int r;

        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        uio = ap->a_uio;
        saveoff = uio->uio_offset;

        /*
         * Setup cookies directory entry cookies if requested
         */
        if (ap->a_ncookies) {
                ncookies = uio->uio_resid / 16 + 1;
                if (ncookies > 1024)
                        ncookies = 1024;
                cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
        } else {
                ncookies = -1;
                cookies = NULL;
        }
        cookie_index = 0;

        parent = hammer2_inode_lock_sh(ip);
        ipdata = &parent->data->ipdata;

        /*
         * Handle artificial entries.  To ensure that only positive 64 bit
         * quantities are returned to userland we always strip off bit 63.
         * The hash code is designed such that codes 0x0000-0x7FFF are not
         * used, allowing us to use these codes for articial entries.
         *
         * Entry 0 is used for '.' and entry 1 is used for '..'.  Do not
         * allow '..' to cross the mount point into (e.g.) the super-root.
         */
        error = 0;
        chain = (void *)(intptr_t)-1;   /* non-NULL for early goto done case */

        if (saveoff == 0) {
                inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 1, ".");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        if (saveoff == 1) {
                /*
                 * Be careful with lockorder when accessing ".."
                 *
                 * (parent is the current dir. xip is the parent dir).
                 *
                 * The loop below drops and re-acquires locks in parent
                 * order, then retries if ip->pip changed while unlocked.
                 */
                inum = parent->data->ipdata.inum & HAMMER2_DIRHASH_USERMSK;
                while (ip->pip != NULL && ip != ip->pmp->iroot) {
                        xip = ip->pip;
                        hammer2_inode_ref(xip);
                        hammer2_inode_unlock_sh(ip, parent);
                        xparent = hammer2_inode_lock_sh(xip);
                        parent = hammer2_inode_lock_sh(ip);
                        hammer2_inode_drop(xip);
                        if (xip == ip->pip) {
                                inum = xparent->data->ipdata.inum &
                                       HAMMER2_DIRHASH_USERMSK;
                                hammer2_inode_unlock_sh(xip, xparent);
                                break;
                        }
                        hammer2_inode_unlock_sh(xip, xparent);
                }
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 2, "..");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        lkey = saveoff | HAMMER2_DIRHASH_VISIBLE;

        /*
         * parent is the inode chain, already locked for us.  Don't
         * double lock shared locks as this will screw up upgrades.
         */
        if (error) {
                goto done;
        }
        chain = hammer2_chain_lookup(hmp, &parent, lkey, lkey,
                                     HAMMER2_LOOKUP_SHARED);
        if (chain == NULL) {
                chain = hammer2_chain_lookup(hmp, &parent,
                                             lkey, (hammer2_key_t)-1,
                                             HAMMER2_LOOKUP_SHARED);
        }
        while (chain) {
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        dtype = hammer2_get_dtype(chain);
                        saveoff = chain->bref.key & HAMMER2_DIRHASH_USERMSK;
                        r = vop_write_dirent(&error, uio,
                                             chain->data->ipdata.inum &
                                              HAMMER2_DIRHASH_USERMSK,
                                             dtype,
                                             chain->data->ipdata.name_len,
                                             chain->data->ipdata.filename);
                        if (r)
                                break;
                        if (cookies)
                                cookies[cookie_index] = saveoff;
                        ++cookie_index;
                } else {
                        /* XXX chain error */
                        kprintf("bad chain type readdir %d\n",
                                chain->bref.type);
                }

                /*
                 * Keys may not be returned in order so once we have a
                 * placemarker (chain) the scan must allow the full range
                 * or some entries will be missed.
                 */
                chain = hammer2_chain_next(hmp, &parent, chain,
                                           HAMMER2_DIRHASH_VISIBLE,
                                           (hammer2_key_t)-1,
                                           HAMMER2_LOOKUP_SHARED);
                if (chain) {
                        saveoff = (chain->bref.key &
                                   HAMMER2_DIRHASH_USERMSK) + 1;
                } else {
                        saveoff = (hammer2_key_t)-1;
                }
                if (cookie_index == ncookies)
                        break;
        }
        if (chain)
                hammer2_chain_unlock(hmp, chain);
done:
        /*
         * The early-exit paths leave chain as the non-NULL -1 sentinel so
         * eofflag is only set when the scan genuinely ran off the end.
         */
        hammer2_inode_unlock_sh(ip, parent);
        if (ap->a_eofflag)
                *ap->a_eofflag = (chain == NULL);
        uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
        if (error && cookie_index == 0) {
                if (cookies) {
                        kfree(cookies, M_TEMP);
                        *ap->a_ncookies = 0;
                        *ap->a_cookies = NULL;
                }
        } else {
                if (cookies) {
                        *ap->a_ncookies = cookie_index;
                        *ap->a_cookies = cookies;
                }
        }
        return (error);
}
585
586 /*
587  * hammer2_vop_readlink { vp, uio, cred }
588  */
589 static
590 int
591 hammer2_vop_readlink(struct vop_readlink_args *ap)
592 {
593         struct vnode *vp;
594         hammer2_mount_t *hmp;
595         hammer2_inode_t *ip;
596         int error;
597
598         vp = ap->a_vp;
599         if (vp->v_type != VLNK)
600                 return (EINVAL);
601         ip = VTOI(vp);
602         hmp = ip->hmp;
603
604         error = hammer2_read_file(ip, ap->a_uio, 0);
605         return (error);
606 }
607
608 static
609 int
610 hammer2_vop_read(struct vop_read_args *ap)
611 {
612         struct vnode *vp;
613         hammer2_mount_t *hmp;
614         hammer2_inode_t *ip;
615         struct uio *uio;
616         int error;
617         int seqcount;
618         int bigread;
619
620         /*
621          * Read operations supported on this vnode?
622          */
623         vp = ap->a_vp;
624         if (vp->v_type != VREG)
625                 return (EINVAL);
626
627         /*
628          * Misc
629          */
630         ip = VTOI(vp);
631         hmp = ip->hmp;
632         uio = ap->a_uio;
633         error = 0;
634
635         seqcount = ap->a_ioflag >> 16;
636         bigread = (uio->uio_resid > 100 * 1024 * 1024);
637
638         error = hammer2_read_file(ip, uio, seqcount);
639         return (error);
640 }
641
642 static
643 int
644 hammer2_vop_write(struct vop_write_args *ap)
645 {
646         hammer2_chain_t *chain;
647         hammer2_mount_t *hmp;
648         hammer2_inode_t *ip;
649         thread_t td;
650         struct vnode *vp;
651         struct uio *uio;
652         int error;
653         int seqcount;
654         int bigwrite;
655
656         /*
657          * Read operations supported on this vnode?
658          */
659         vp = ap->a_vp;
660         if (vp->v_type != VREG)
661                 return (EINVAL);
662
663         /*
664          * Misc
665          */
666         ip = VTOI(vp);
667         hmp = ip->hmp;
668         uio = ap->a_uio;
669         error = 0;
670         if (hmp->ronly)
671                 return (EROFS);
672
673         seqcount = ap->a_ioflag >> 16;
674         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
675
676         /*
677          * Check resource limit
678          */
679         if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
680             uio->uio_offset + uio->uio_resid >
681              td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
682                 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
683                 return (EFBIG);
684         }
685
686         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
687
688         /*
689          * ip must be locked if extending the file.
690          * ip must be locked to avoid racing a truncation.
691          *
692          * ip must be marked modified, particularly because the write
693          * might wind up being copied into the embedded data area.
694          */
695         chain = hammer2_inode_lock_ex(ip);
696         error = hammer2_write_file(ip, &chain, uio, ap->a_ioflag, seqcount);
697         hammer2_inode_unlock_ex(ip, chain);
698         return (error);
699 }
700
701 /*
702  * Perform read operations on a file or symlink given an UNLOCKED
703  * inode and uio.
704  *
705  * The passed ip is not locked.
706  */
707 static
708 int
709 hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
710 {
711         hammer2_chain_t *chain;
712         hammer2_off_t size;
713         struct buf *bp;
714         int error;
715
716         error = 0;
717
718         /*
719          * UIO read loop.
720          *
721          * We can't hold a shared lock on ip's chain across file bread's
722          * because the bread operation will itself obtain a shared lock,
723          * resulting in one thread holding 2 shared refs.  This will deadlock
724          * against temporary lock upgrades.  Temporary lock upgrades are
725          * needed to insert new chain structures into a parent's RB tree.
726          *
727          * We should be able to safely retain the shared lock on ip itself.
728          */
729         chain = hammer2_inode_lock_sh(ip);
730         size = chain->data->ipdata.size;
731         hammer2_chain_unlock(ip->hmp, chain);
732         chain = NULL;
733
734         while (uio->uio_resid > 0 && uio->uio_offset < size) {
735                 hammer2_key_t lbase;
736                 hammer2_key_t leof;
737                 int lblksize;
738                 int loff;
739                 int n;
740
741                 lblksize = hammer2_calc_logical(ip, uio->uio_offset,
742                                                 &lbase, &leof);
743
744                 error = cluster_read(ip->vp, leof, lbase, lblksize,
745                                      uio->uio_resid, seqcount * BKVASIZE,
746                                      &bp);
747
748                 if (error)
749                         break;
750                 loff = (int)(uio->uio_offset - lbase);
751                 n = lblksize - loff;
752                 if (n > uio->uio_resid)
753                         n = uio->uio_resid;
754                 if (n > size - uio->uio_offset)
755                         n = (int)(size - uio->uio_offset);
756                 bp->b_flags |= B_AGE;
757                 uiomove((char *)bp->b_data + loff, n, uio);
758                 bqrelse(bp);
759         }
760         hammer2_inode_unlock_sh(ip, chain);
761         return (error);
762 }
763
764 /*
765  * Called with a locked (ip) to do the underlying write to a file or
766  * to build the symlink target.
767  */
768 static
769 int
770 hammer2_write_file(hammer2_inode_t *ip, hammer2_chain_t **chainp,
771                    struct uio *uio,
772                    int ioflag, int seqcount)
773 {
774         hammer2_inode_data_t *ipdata;
775         hammer2_key_t old_eof;
776         struct buf *bp;
777         int kflags;
778         int error;
779         int modified = 0;
780
781         /*
782          * Setup if append
783          */
784         ipdata = &ip->chain->data->ipdata;
785         if (ioflag & IO_APPEND)
786                 uio->uio_offset = ipdata->size;
787         kflags = 0;
788         error = 0;
789
790         /*
791          * Extend the file if necessary.  If the write fails at some point
792          * we will truncate it back down to cover as much as we were able
793          * to write.
794          *
795          * Doing this now makes it easier to calculate buffer sizes in
796          * the loop.
797          */
798         old_eof = ipdata->size;
799         if (uio->uio_offset + uio->uio_resid > ipdata->size) {
800                 modified = 1;
801                 hammer2_extend_file(ip, uio->uio_offset + uio->uio_resid);
802                 kflags |= NOTE_EXTEND;
803         }
804
805         /*
806          * UIO write loop
807          */
808         while (uio->uio_resid > 0) {
809                 hammer2_key_t lbase;
810                 hammer2_key_t leof;
811                 int trivial;
812                 int lblksize;
813                 int loff;
814                 int n;
815
816                 /*
817                  * Don't allow the buffer build to blow out the buffer
818                  * cache.
819                  */
820                 if ((ioflag & IO_RECURSE) == 0) {
821                         /*
822                          * XXX should try to leave this unlocked through
823                          *      the whole loop
824                          */
825                         hammer2_inode_unlock_ex(ip, *chainp);
826                         bwillwrite(HAMMER2_PBUFSIZE);
827                         *chainp = hammer2_inode_lock_ex(ip);
828                         ipdata = &(*chainp)->data->ipdata;      /* reload */
829                 }
830
831                 /* XXX bigwrite & signal check test */
832
833                 /*
834                  * This nominally tells us how much we can cluster and
835                  * what the logical buffer size needs to be.  Currently
836                  * we don't try to cluster the write and just handle one
837                  * block at a time.
838                  */
839                 lblksize = hammer2_calc_logical(ip, uio->uio_offset,
840                                                 &lbase, &leof);
841                 loff = (int)(uio->uio_offset - lbase);
842
843                 /*
844                  * Calculate bytes to copy this transfer and whether the
845                  * copy completely covers the buffer or not.
846                  */
847                 trivial = 0;
848                 n = lblksize - loff;
849                 if (n > uio->uio_resid) {
850                         n = uio->uio_resid;
851                         if (uio->uio_offset + n == ipdata->size)
852                                 trivial = 1;
853                 } else if (loff == 0) {
854                         trivial = 1;
855                 }
856
857                 /*
858                  * Get the buffer
859                  */
860                 if (uio->uio_segflg == UIO_NOCOPY) {
861                         /*
862                          * Issuing a write with the same data backing the
863                          * buffer.  Instantiate the buffer to collect the
864                          * backing vm pages, then read-in any missing bits.
865                          *
866                          * This case is used by vop_stdputpages().
867                          */
868                         bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
869                         if ((bp->b_flags & B_CACHE) == 0) {
870                                 bqrelse(bp);
871                                 error = bread(ip->vp, lbase, lblksize, &bp);
872                         }
873                 } else if (trivial) {
874                         /*
875                          * Even though we are entirely overwriting the buffer
876                          * we may still have to zero it out to avoid a
877                          * mmap/write visibility issue.
878                          */
879                         bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
880                         if ((bp->b_flags & B_CACHE) == 0)
881                                 vfs_bio_clrbuf(bp);
882                 } else {
883                         /*
884                          * Partial overwrite, read in any missing bits then
885                          * replace the portion being written.
886                          *
887                          * (The strategy code will detect zero-fill physical
888                          * blocks for this case).
889                          */
890                         error = bread(ip->vp, lbase, lblksize, &bp);
891                         if (error == 0)
892                                 bheavy(bp);
893                 }
894
895                 if (error) {
896                         brelse(bp);
897                         break;
898                 }
899
900                 /*
901                  * We have to assign physical storage to the buffer we intend
902                  * to dirty or write now to avoid deadlocks in the strategy
903                  * code later.
904                  *
905                  * This can return NOOFFSET for inode-embedded data.  The
906                  * strategy code will take care of it in that case.
907                  */
908                 bp->b_bio2.bio_offset =
909                         hammer2_assign_physical(ip, lbase, lblksize, &error);
910                 if (error) {
911                         brelse(bp);
912                         break;
913                 }
914
915                 /*
916                  * Ok, copy the data in
917                  */
918                 hammer2_inode_unlock_ex(ip, *chainp);
919                 error = uiomove(bp->b_data + loff, n, uio);
920                 *chainp = hammer2_inode_lock_ex(ip);
921                 ipdata = &(*chainp)->data->ipdata;      /* reload */
922                 kflags |= NOTE_WRITE;
923                 modified = 1;
924
925                 if (error) {
926                         brelse(bp);
927                         break;
928                 }
929
930                 /* XXX update ip_data.mtime */
931
932                 /*
933                  * Once we dirty a buffer any cached offset becomes invalid.
934                  *
935                  * NOTE: For cluster_write() always use the trailing block
936                  *       size, which is HAMMER2_PBUFSIZE.  lblksize is the
937                  *       eof-straddling blocksize and is incorrect.
938                  */
939                 bp->b_flags |= B_AGE;
940                 if (ioflag & IO_SYNC) {
941                         bwrite(bp);
942                 } else if ((ioflag & IO_DIRECT) && loff + n == lblksize) {
943                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
944                                 bp->b_flags |= B_CLUSTEROK;
945                         bdwrite(bp);
946                 } else if (ioflag & IO_ASYNC) {
947                         bawrite(bp);
948                 } else if (hammer2_cluster_enable) {
949                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
950                                 bp->b_flags |= B_CLUSTEROK;
951                         cluster_write(bp, leof, HAMMER2_PBUFSIZE, seqcount);
952                 } else {
953                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
954                                 bp->b_flags |= B_CLUSTEROK;
955                         bdwrite(bp);
956                 }
957         }
958
959         /*
960          * Cleanup.  If we extended the file EOF but failed to write through
961          * the entire write is a failure and we have to back-up.
962          */
963         if (error && ipdata->size != old_eof) {
964                 hammer2_truncate_file(ip, old_eof);
965         } else if (modified) {
966                 KKASSERT(ip->chain == *chainp);
967                 hammer2_chain_modify(ip->hmp, *chainp, 0);
968                 hammer2_update_time(&ipdata->mtime);
969         }
970         hammer2_knote(ip->vp, kflags);
971         return error;
972 }
973
974 /*
975  * Assign physical storage to a logical block.
976  *
977  * NOOFFSET is returned if the data is inode-embedded.  In this case the
978  * strategy code will simply bcopy() the data into the inode.
979  *
980  * The inode's delta_dcount is adjusted.
981  */
982 static
983 hammer2_off_t
984 hammer2_assign_physical(hammer2_inode_t *ip, hammer2_key_t lbase,
985                         int lblksize, int *errorp)
986 {
987         hammer2_mount_t *hmp;
988         hammer2_chain_t *parent;
989         hammer2_chain_t *chain;
990         hammer2_off_t pbase;
991
992         /*
993          * Locate the chain associated with lbase, return a locked chain.
994          * However, do not instantiate any data reference (which utilizes a
995          * device buffer) because we will be using direct IO via the
996          * logical buffer cache buffer.
997          */
998         hmp = ip->hmp;
999         *errorp = 0;
1000 retry:
1001         parent = hammer2_inode_lock_ex(ip);
1002         chain = hammer2_chain_lookup(hmp, &parent,
1003                                      lbase, lbase,
1004                                      HAMMER2_LOOKUP_NODATA);
1005
1006         if (chain == NULL) {
1007                 /*
1008                  * We found a hole, create a new chain entry.
1009                  *
1010                  * NOTE: DATA chains are created without device backing
1011                  *       store (nor do we want any).
1012                  */
1013                 chain = hammer2_chain_create(hmp, parent, NULL,
1014                                              lbase, HAMMER2_PBUFRADIX,
1015                                              HAMMER2_BREF_TYPE_DATA,
1016                                              lblksize, errorp);
1017                 if (chain == NULL) {
1018                         KKASSERT(*errorp == EAGAIN); /* XXX */
1019                         hammer2_inode_unlock_ex(ip, parent);
1020                         goto retry;
1021                 }
1022
1023                 pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
1024                 /*ip->delta_dcount += lblksize;*/
1025         } else {
1026                 switch (chain->bref.type) {
1027                 case HAMMER2_BREF_TYPE_INODE:
1028                         /*
1029                          * The data is embedded in the inode.  The
1030                          * caller is responsible for marking the inode
1031                          * modified and copying the data to the embedded
1032                          * area.
1033                          */
1034                         pbase = NOOFFSET;
1035                         break;
1036                 case HAMMER2_BREF_TYPE_DATA:
1037                         if (chain->bytes != lblksize) {
1038                                 panic("hammer2_assign_physical: "
1039                                       "size mismatch %d/%d\n",
1040                                       lblksize, chain->bytes);
1041                         }
1042                         hammer2_chain_modify(hmp, chain,
1043                                              HAMMER2_MODIFY_OPTDATA);
1044                         pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
1045                         break;
1046                 default:
1047                         panic("hammer2_assign_physical: bad type");
1048                         /* NOT REACHED */
1049                         pbase = NOOFFSET;
1050                         break;
1051                 }
1052         }
1053
1054         if (chain)
1055                 hammer2_chain_unlock(hmp, chain);
1056         hammer2_inode_unlock_ex(ip, parent);
1057
1058         return (pbase);
1059 }
1060
1061 /*
1062  * Truncate the size of a file.
1063  *
1064  * This routine adjusts ipdata->size smaller, destroying any related
1065  * data beyond the new EOF and potentially resizing the block straddling
1066  * the EOF.
1067  *
1068  * The inode must be locked.
1069  */
1070 static
1071 void
1072 hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
1073 {
1074         hammer2_inode_data_t *ipdata;
1075         hammer2_chain_t *parent;
1076         hammer2_chain_t *chain;
1077         hammer2_mount_t *hmp = ip->hmp;
1078         hammer2_key_t lbase;
1079         hammer2_key_t leof;
1080         struct buf *bp;
1081         int loff;
1082         int error;
1083         int oblksize;
1084         int nblksize;
1085
1086         hammer2_chain_modify(hmp, ip->chain, 0);
1087         bp = NULL;
1088         ipdata = &ip->chain->data->ipdata;
1089
1090         /*
1091          * Destroy any logical buffer cache buffers beyond the file EOF.
1092          *
1093          * We call nvtruncbuf() w/ trivial == 1 to prevent it from messing
1094          * around with the buffer straddling EOF, because we need to assign
1095          * a new physical offset to it.
1096          */
1097         if (ip->vp) {
1098                 nvtruncbuf(ip->vp, nsize,
1099                            HAMMER2_PBUFSIZE, (int)nsize & HAMMER2_PBUFMASK,
1100                            1);
1101         }
1102
1103         /*
1104          * Setup for lookup/search
1105          */
1106         parent = ip->chain;
1107         error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
1108         if (error) {
1109                 hammer2_chain_unlock(hmp, parent);
1110                 /* XXX error reporting */
1111                 return;
1112         }
1113
1114         /*
1115          * Handle the case where a chain/logical-buffer straddles the new
1116          * EOF.  We told nvtruncbuf() above not to mess with the logical
1117          * buffer straddling the EOF because we need to reassign its storage
1118          * and can't let the strategy code do it for us.
1119          */
1120         loff = (int)nsize & HAMMER2_PBUFMASK;
1121         if (loff && ip->vp) {
1122                 oblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
1123                 error = bread(ip->vp, lbase, oblksize, &bp);
1124                 KKASSERT(error == 0);
1125         }
1126         ipdata->size = nsize;
1127         nblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
1128
1129         /*
1130          * Fixup the chain element.  If we have a logical buffer in-hand
1131          * we don't want to create a conflicting device buffer.
1132          */
1133         if (loff && bp) {
1134                 chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase,
1135                                              HAMMER2_LOOKUP_NODATA);
1136                 if (chain) {
1137                         allocbuf(bp, nblksize);
1138                         switch(chain->bref.type) {
1139                         case HAMMER2_BREF_TYPE_DATA:
1140                                 hammer2_chain_resize(ip, chain,
1141                                              hammer2_allocsize(nblksize),
1142                                              HAMMER2_MODIFY_OPTDATA);
1143                                 bzero(bp->b_data + loff, nblksize - loff);
1144                                 bp->b_bio2.bio_offset = chain->bref.data_off &
1145                                                         HAMMER2_OFF_MASK;
1146                                 break;
1147                         case HAMMER2_BREF_TYPE_INODE:
1148                                 bzero(bp->b_data + loff, nblksize - loff);
1149                                 bp->b_bio2.bio_offset = NOOFFSET;
1150                                 break;
1151                         default:
1152                                 panic("hammer2_truncate_file: bad type");
1153                                 break;
1154                         }
1155                         hammer2_chain_unlock(hmp, chain);
1156                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
1157                                 bp->b_flags |= B_CLUSTEROK;
1158                         bdwrite(bp);
1159                 } else {
1160                         /*
1161                          * Destroy clean buffer w/ wrong buffer size.  Retain
1162                          * backing store.
1163                          */
1164                         bp->b_flags |= B_RELBUF;
1165                         KKASSERT(bp->b_bio2.bio_offset == NOOFFSET);
1166                         KKASSERT((bp->b_flags & B_DIRTY) == 0);
1167                         bqrelse(bp);
1168                 }
1169         } else if (loff) {
1170                 /*
1171                  * WARNING: This utilizes a device buffer for the data.
1172                  *
1173                  * This case should not occur because file truncations without
1174                  * a vnode (and hence no logical buffer cache) should only
1175                  * always truncate to 0-length.
1176                  */
1177                 panic("hammer2_truncate_file: non-zero truncation, no-vnode");
1178 #if 0
1179                 chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase, 0);
1180                 if (chain) {
1181                         switch(chain->bref.type) {
1182                         case HAMMER2_BREF_TYPE_DATA:
1183                                 hammer2_chain_resize(ip, chain,
1184                                              hammer2_allocsize(nblksize),
1185                                              0);
1186                                 hammer2_chain_modify(hmp, chain, 0);
1187                                 bzero(chain->data->buf + loff, nblksize - loff);
1188                                 break;
1189                         case HAMMER2_BREF_TYPE_INODE:
1190                                 if (loff < HAMMER2_EMBEDDED_BYTES) {
1191                                         hammer2_chain_modify(hmp, chain, 0);
1192                                         bzero(chain->data->ipdata.u.data + loff,
1193                                               HAMMER2_EMBEDDED_BYTES - loff);
1194                                 }
1195                                 break;
1196                         }
1197                         hammer2_chain_unlock(hmp, chain);
1198                 }
1199 #endif
1200         }
1201
1202         /*
1203          * Clean up any fragmentory VM pages now that we have properly
1204          * resized the straddling buffer.  These pages are no longer
1205          * part of the buffer.
1206          */
1207         if (ip->vp) {
1208                 nvtruncbuf(ip->vp, nsize,
1209                            nblksize, (int)nsize & (nblksize - 1),
1210                            1);
1211         }
1212
1213         /*
1214          * Destroy any physical blocks after the new EOF point.
1215          */
1216         lbase = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
1217         chain = hammer2_chain_lookup(hmp, &parent,
1218                                      lbase, (hammer2_key_t)-1,
1219                                      HAMMER2_LOOKUP_NODATA);
1220         while (chain) {
1221                 /*
1222                  * Degenerate embedded data case, nothing to loop on.
1223                  */
1224                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1225                         hammer2_chain_unlock(hmp, chain);
1226                         break;
1227                 }
1228
1229                 /*
1230                  * Delete physical data blocks past the file EOF.
1231                  */
1232                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1233                         /*ip->delta_dcount -= chain->bytes;*/
1234                         hammer2_chain_delete(hmp, parent, chain, 0);
1235                 }
1236                 /* XXX check parent if empty indirect block & delete */
1237                 chain = hammer2_chain_next(hmp, &parent, chain,
1238                                            lbase, (hammer2_key_t)-1,
1239                                            HAMMER2_LOOKUP_NODATA);
1240         }
1241         hammer2_chain_unlock(hmp, parent);
1242 }
1243
1244 /*
1245  * Extend the size of a file.  The inode must be locked.
1246  *
1247  * We may have to resize the block straddling the old EOF.
1248  */
1249 static
1250 void
1251 hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
1252 {
1253         hammer2_inode_data_t *ipdata;
1254         hammer2_mount_t *hmp;
1255         hammer2_chain_t *parent;
1256         hammer2_chain_t *chain;
1257         struct buf *bp;
1258         hammer2_key_t osize;
1259         hammer2_key_t obase;
1260         hammer2_key_t nbase;
1261         hammer2_key_t leof;
1262         int oblksize;
1263         int nblksize;
1264         int nradix;
1265         int error;
1266
1267         KKASSERT(ip->vp);
1268         hmp = ip->hmp;
1269
1270         hammer2_chain_modify(hmp, ip->chain, 0);
1271         ipdata = &ip->chain->data->ipdata;
1272
1273         /*
1274          * Nothing to do if the direct-data case is still intact
1275          */
1276         if ((ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
1277             nsize <= HAMMER2_EMBEDDED_BYTES) {
1278                 ipdata->size = nsize;
1279                 nvextendbuf(ip->vp,
1280                             ipdata->size, nsize,
1281                             0, HAMMER2_EMBEDDED_BYTES,
1282                             0, (int)nsize,
1283                             1);
1284                 return;
1285         }
1286
1287         /*
1288          * Calculate the blocksize at the original EOF and resize the block
1289          * if necessary.  Adjust the file size in the inode.
1290          */
1291         osize = ipdata->size;
1292         oblksize = hammer2_calc_logical(ip, osize, &obase, &leof);
1293         ipdata->size = nsize;
1294         nblksize = hammer2_calc_logical(ip, osize, &nbase, &leof);
1295
1296         /*
1297          * Do all required vnode operations, but do not mess with the
1298          * buffer straddling the orignal EOF.
1299          */
1300         nvextendbuf(ip->vp,
1301                     ipdata->size, nsize,
1302                     0, nblksize,
1303                     0, (int)nsize & HAMMER2_PBUFMASK,
1304                     1);
1305
1306         /*
1307          * Early return if we have no more work to do.
1308          */
1309         if (obase == nbase && oblksize == nblksize &&
1310             (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1311                 return;
1312         }
1313
1314         /*
1315          * We have work to do, including possibly resizing the buffer
1316          * at the previous EOF point and turning off DIRECTDATA mode.
1317          */
1318         bp = NULL;
1319         if (((int)osize & HAMMER2_PBUFMASK)) {
1320                 error = bread(ip->vp, obase, oblksize, &bp);
1321                 KKASSERT(error == 0);
1322
1323                 if (obase != nbase) {
1324                         if (oblksize != HAMMER2_PBUFSIZE)
1325                                 allocbuf(bp, HAMMER2_PBUFSIZE);
1326                 } else {
1327                         if (oblksize != nblksize)
1328                                 allocbuf(bp, nblksize);
1329                 }
1330         }
1331
1332         /*
1333          * Disable direct-data mode by loading up a buffer cache buffer
1334          * with the data, then converting the inode data area into the
1335          * inode indirect block array area.
1336          */
1337         if (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1338                 ipdata->op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
1339                 bzero(&ipdata->u.blockset, sizeof(ipdata->u.blockset));
1340         }
1341
1342         /*
1343          * Resize the chain element at the old EOF.
1344          */
1345         if (((int)osize & HAMMER2_PBUFMASK)) {
1346 retry:
1347                 parent = ip->chain;
1348                 error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
1349                 KKASSERT(error == 0);
1350
1351                 nradix = hammer2_allocsize(nblksize);
1352
1353                 chain = hammer2_chain_lookup(hmp, &parent,
1354                                              obase, obase,
1355                                              HAMMER2_LOOKUP_NODATA);
1356                 if (chain == NULL) {
1357                         chain = hammer2_chain_create(hmp, parent, NULL,
1358                                                      obase, nblksize,
1359                                                      HAMMER2_BREF_TYPE_DATA,
1360                                                      nblksize, &error);
1361                         if (chain == NULL) {
1362                                 KKASSERT(error == EAGAIN);
1363                                 hammer2_chain_unlock(hmp, parent);
1364                                 goto retry;
1365                         }
1366                         /*ip->delta_dcount += nblksize;*/
1367                 } else {
1368                         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
1369                         hammer2_chain_resize(ip, chain, nradix,
1370                                              HAMMER2_MODIFY_OPTDATA);
1371                 }
1372                 bp->b_bio2.bio_offset = chain->bref.data_off &
1373                                         HAMMER2_OFF_MASK;
1374                 hammer2_chain_unlock(hmp, chain);
1375                 if (bp->b_bcount == HAMMER2_PBUFSIZE)
1376                         bp->b_flags |= B_CLUSTEROK;
1377                 bdwrite(bp);
1378                 hammer2_chain_unlock(hmp, parent);
1379         }
1380 }
1381
/*
 * hammer2_vop_nresolve { nch, dvp, cred }
 *
 * Resolve the filename in the namecache entry against directory (a_dvp).
 * On success the namecache entry is resolved to the vnode, on ENOENT it
 * is resolved to NULL, and on any other error it is left unresolved.
 * Hardlink forwarding entries are chased to the real inode and, when
 * nlinks drops to 1 on a writable mount, deconsolidated.
 */
static
int
hammer2_vop_nresolve(struct vop_nresolve_args *ap)
{
	hammer2_inode_t *ip;
	hammer2_inode_t *dip;
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_chain_t *ochain;
	struct namecache *ncp;
	const uint8_t *name;
	size_t name_len;
	hammer2_key_t lhc;
	int error = 0;
	struct vnode *vp;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;
	ncp = ap->a_nch->ncp;
	name = ncp->nc_name;
	name_len = ncp->nc_nlen;
	lhc = hammer2_dirhash(name, name_len);

	/*
	 * Note: In DragonFly the kernel handles '.' and '..'.
	 */
	parent = hammer2_inode_lock_sh(dip);
	chain = hammer2_chain_lookup(hmp, &parent,
				     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
				     HAMMER2_LOOKUP_SHARED);
	/*
	 * Scan all entries in the directory-hash collision range for an
	 * exact filename match.
	 */
	while (chain) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
		    name_len == chain->data->ipdata.name_len &&
		    bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
			break;
		}
		chain = hammer2_chain_next(hmp, &parent, chain,
					   lhc, lhc + HAMMER2_DIRHASH_LOMASK,
					   HAMMER2_LOOKUP_SHARED);
	}
	hammer2_inode_unlock_sh(dip, parent);

	/*
	 * If the inode represents a forwarding entry for a hardlink we have
	 * to locate the actual inode.  The original ip is saved for possible
	 * deconsolidation.  (ip) will only be set to non-NULL when we have
	 * to locate the real file via a hardlink.  ip will be referenced but
	 * not locked in that situation.  chain is passed in locked and
	 * returned locked.
	 *
	 * XXX what kind of chain lock?
	 */
	ochain = NULL;
	if (chain && chain->data->ipdata.type == HAMMER2_OBJTYPE_HARDLINK) {
		error = hammer2_hardlink_find(dip, &chain, &ochain);
		if (error) {
			kprintf("hammer2: unable to find hardlink\n");
			if (chain) {
				hammer2_chain_unlock(hmp, chain);
				chain = NULL;
			}
			goto failed;
		}
	}

	/*
	 * Deconsolidate any hardlink whose nlinks == 1.  Ignore errors.
	 * If an error occurs chain and ip are left alone.
	 *
	 * XXX upgrade shared lock?
	 */
	if (ochain && chain && chain->data->ipdata.nlinks == 1 && !hmp->ronly) {
		kprintf("hammer2: need to unconsolidate hardlink for %s\n",
			chain->data->ipdata.filename);
		/* XXX retain shared lock on dip? (currently not held) */
		hammer2_hardlink_deconsolidate(dip, &chain, &ochain);
	}

	/*
	 * Acquire the related vnode
	 *
	 * NOTE: For error processing, only ENOENT resolves the namecache
	 *       entry to NULL, otherwise we just return the error and
	 *       leave the namecache unresolved.
	 *
	 * NOTE: multiple hammer2_inode structures can be aliased to the
	 *       same chain element, for example for hardlinks.  This
	 *       use case does not 'reattach' inode associations that
	 *       might already exist, but always allocates a new one.
	 */
	if (chain) {
		ip = hammer2_inode_get(dip->hmp, dip->pmp, dip, chain);
		vp = hammer2_igetv(ip, &error);
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
		} else if (error == ENOENT) {
			cache_setvp(ap->a_nch, NULL);
		}
		/*
		 * don't break the API, chain is locked shared so unlock
		 * it separately even though unlock_ex() currently doesn't
		 * care.
		 */
		hammer2_inode_unlock_ex(ip, NULL);
		hammer2_chain_unlock(hmp, chain);

		/*
		 * The vp should not be released until after we've disposed
		 * of our locks, because it might cause vop_inactive() to
		 * be called.
		 */
		if (vp)
			vrele(vp);
	} else {
		error = ENOENT;
		cache_setvp(ap->a_nch, NULL);
	}
failed:
	KASSERT(error || ap->a_nch->ncp->nc_vp != NULL,
		("resolve error %d/%p chain %p ap %p\n",
		 error, ap->a_nch->ncp->nc_vp, chain, ap));
	if (ochain)
		hammer2_chain_drop(hmp, ochain);
	return error;
}
1509
1510 static
1511 int
1512 hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1513 {
1514         hammer2_chain_t *chain;
1515         hammer2_inode_t *dip;
1516         hammer2_inode_t *ip;
1517         hammer2_mount_t *hmp;
1518         int error;
1519
1520         dip = VTOI(ap->a_dvp);
1521         hmp = dip->hmp;
1522
1523         if ((ip = dip->pip) == NULL) {
1524                 *ap->a_vpp = NULL;
1525                 return ENOENT;
1526         }
1527         chain = hammer2_inode_lock_ex(ip);
1528         *ap->a_vpp = hammer2_igetv(ip, &error);
1529         hammer2_inode_unlock_ex(ip, chain);
1530
1531         return error;
1532 }
1533
1534 static
1535 int
1536 hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
1537 {
1538         hammer2_chain_t *nchain;
1539         hammer2_mount_t *hmp;
1540         hammer2_inode_t *dip;
1541         hammer2_inode_t *nip;
1542         struct namecache *ncp;
1543         const uint8_t *name;
1544         size_t name_len;
1545         int error;
1546
1547         dip = VTOI(ap->a_dvp);
1548         hmp = dip->hmp;
1549         if (hmp->ronly)
1550                 return (EROFS);
1551
1552         ncp = ap->a_nch->ncp;
1553         name = ncp->nc_name;
1554         name_len = ncp->nc_nlen;
1555
1556         error = hammer2_inode_create(dip, ap->a_vap, ap->a_cred,
1557                                      name, name_len, &nip, &nchain);
1558         if (error) {
1559                 KKASSERT(nip == NULL);
1560                 *ap->a_vpp = NULL;
1561                 return error;
1562         }
1563         *ap->a_vpp = hammer2_igetv(nip, &error);
1564         hammer2_inode_unlock_ex(nip, nchain);
1565
1566         if (error == 0) {
1567                 cache_setunresolved(ap->a_nch);
1568                 cache_setvp(ap->a_nch, *ap->a_vpp);
1569         }
1570         return error;
1571 }
1572
/*
 * Return the largest contiguous physical disk range for the logical
 * request.
 *
 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
 */
static
int
hammer2_vop_bmap(struct vop_bmap_args *ap)
{
	struct vnode *vp;
	hammer2_mount_t *hmp;
	hammer2_inode_t *ip;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t lbeg;		/* start of scanned logical range */
	hammer2_key_t lend;		/* end of scanned logical range (incl) */
	hammer2_off_t pbeg;		/* physical offset for a_loffset */
	hammer2_off_t pbytes;		/* contiguous physical run in bytes */
	hammer2_off_t array[HAMMER2_BMAP_COUNT][2];  /* [i]={data_off,bytes} */
	int loff;			/* sub-PBUF offset within array[0] */
	int ai;

	/*
	 * Only supported on regular files
	 *
	 * Only supported for read operations (required for cluster_read).
	 * The block allocation is delayed for write operations.
	 */
	vp = ap->a_vp;
	if (vp->v_type != VREG)
		return (EOPNOTSUPP);
	if (ap->a_cmd != BUF_CMD_READ)
		return (EOPNOTSUPP);

	ip = VTOI(vp);
	hmp = ip->hmp;
	bzero(array, sizeof(array));

	/*
	 * Calculate logical range.  The scan window is HAMMER2_BMAP_COUNT
	 * physical buffers starting at the PBUF-aligned portion of the
	 * requested logical offset; the remainder goes into loff.
	 */
	KKASSERT((ap->a_loffset & HAMMER2_LBUFMASK64) == 0);
	lbeg = ap->a_loffset & HAMMER2_OFF_MASK_HI;
	lend = lbeg + HAMMER2_BMAP_COUNT * HAMMER2_PBUFSIZE - 1;
	if (lend < lbeg)	/* clamp on 64-bit wraparound */
		lend = lbeg;
	loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;

	/*
	 * Scan the range under a shared inode lock, recording the media
	 * offset and size of every DATA chain into the array, indexed by
	 * its PBUF slot relative to lbeg.
	 */
	parent = hammer2_inode_lock_sh(ip);
	chain = hammer2_chain_lookup(hmp, &parent,
				     lbeg, lend,
				     HAMMER2_LOOKUP_NODATA |
				     HAMMER2_LOOKUP_SHARED);
	if (chain == NULL) {
		/* no backing store at all -> zero-fill */
		*ap->a_doffsetp = ZFOFFSET;
		hammer2_inode_unlock_sh(ip, parent);
		return (0);
	}

	while (chain) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			ai = (chain->bref.key - lbeg) / HAMMER2_PBUFSIZE;
			KKASSERT(ai >= 0 && ai < HAMMER2_BMAP_COUNT);
			array[ai][0] = chain->bref.data_off & HAMMER2_OFF_MASK;
			array[ai][1] = chain->bytes;
		}
		chain = hammer2_chain_next(hmp, &parent, chain,
					   lbeg, lend,
					   HAMMER2_LOOKUP_NODATA |
					   HAMMER2_LOOKUP_SHARED);
	}
	hammer2_inode_unlock_sh(ip, parent);

	/*
	 * If the requested loffset is not mappable physically we can't
	 * bmap.  The caller will have to access the file data via a
	 * device buffer.
	 */
	if (array[0][0] == 0 || array[0][1] < loff + HAMMER2_LBUFSIZE) {
		*ap->a_doffsetp = NOOFFSET;
		return (0);
	}

	/*
	 * Calculate the physical disk offset range for array[0]
	 */
	pbeg = array[0][0] + loff;
	pbytes = array[0][1] - loff;

	/*
	 * Extend the run while successive slots are physically contiguous
	 * with the accumulated range.
	 */
	for (ai = 1; ai < HAMMER2_BMAP_COUNT; ++ai) {
		if (array[ai][0] != pbeg + pbytes)
			break;
		pbytes += array[ai][1];
	}

	*ap->a_doffsetp = pbeg;
	if (ap->a_runp)
		*ap->a_runp = pbytes;
	return (0);
}
1674
/*
 * hammer2_vop_open { vp, mode, cred, fp }
 *
 * No hammer2-specific open-time processing is required; defer
 * entirely to the standard open handler.
 */
static
int
hammer2_vop_open(struct vop_open_args *ap)
{
	int error;

	error = vop_stdopen(ap);
	return (error);
}
1681
/*
 * hammer2_vop_advlock { vp, id, op, fl, flags }
 *
 * Advisory byte-range locking (fcntl/flock).  The current file size
 * is sampled under a shared inode lock and passed to the generic
 * lf_advlock() machinery along with the per-inode lock list.
 */
static
int
hammer2_vop_advlock(struct vop_advlock_args *ap)
{
	hammer2_inode_t *ip = VTOI(ap->a_vp);
	hammer2_chain_t *chain;
	hammer2_off_t size;

	/*
	 * Snapshot the size and release the inode lock before calling
	 * lf_advlock(), which may block.
	 */
	chain = hammer2_inode_lock_sh(ip);
	size = chain->data->ipdata.size;
	hammer2_inode_unlock_sh(ip, chain);
	return (lf_advlock(ap, &ip->advlock, size));
}
1698
1699
/*
 * hammer2_vop_close { vp, fflag }
 *
 * Nothing hammer2-specific to do on close; use the stock handler.
 */
static
int
hammer2_vop_close(struct vop_close_args *ap)
{
	int error;

	error = vop_stdclose(ap);
	return (error);
}
1706
/*
 * hammer2_vop_nlink { nch, dvp, vp, cred }
 *
 * Create a hardlink from (vp) to {dvp, nch}.
 */
static
int
hammer2_vop_nlink(struct vop_nlink_args *ap)
{
	hammer2_inode_t *dip;	/* target directory to create link in */
	hammer2_inode_t *ip;	/* inode we are hardlinking to */
	hammer2_inode_t *oip;	/* original (pre-consolidation) inode */
	hammer2_mount_t *hmp;
	hammer2_chain_t *chain;
	hammer2_chain_t *ochain;
	struct namecache *ncp;
	const uint8_t *name;
	size_t name_len;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;
	if (hmp->ronly)
		return (EROFS);

	/*
	 * (ip) is the inode we are linking to.
	 */
	ip = oip = VTOI(ap->a_vp);
	hammer2_inode_ref(ip);

	ncp = ap->a_nch->ncp;
	name = ncp->nc_name;
	name_len = ncp->nc_nlen;

	/*
	 * Create a consolidated real file for the hardlink, adjust (ip),
	 * and move the nlinks lock if necessary.  Tell the function to
	 * bump the hardlink count on the consolidated file.
	 */
	error = hammer2_hardlink_consolidate(&ip, dip);
	if (error)
		goto done;

	/*
	 * If the consolidation changed ip to a HARDLINK pointer we have
	 * to adjust the vnode to point to the actual ip.
	 *
	 * XXX this can race against concurrent vnode ops.
	 */
	if (oip != ip) {
		/* extra ref for the vp->ip association being installed */
		hammer2_inode_ref(ip);			/* vp ref+ */
		chain = hammer2_inode_lock_ex(ip);
		ochain = hammer2_inode_lock_ex(oip);
		if (ip->vp) {
			/* another thread already re-pointed the vnode */
			KKASSERT(ip->vp == ap->a_vp);
			hammer2_inode_drop(ip);		/* vp already ref'd */
		} else {
			ip->vp = ap->a_vp;
			ap->a_vp->v_data = ip;
		}
		if (oip->vp) {
			/* detach the vnode from the old inode */
			KKASSERT(oip->vp == ap->a_vp);
			oip->vp = NULL;
			hammer2_inode_drop(oip);	/* vp ref- */
		}
		hammer2_inode_unlock_ex(oip, ochain);
		hammer2_inode_unlock_ex(ip, chain);
	}

	/*
	 * The act of connecting the existing (ip) will properly bump the
	 * nlinks count.  However, vp will incorrectly point at the old
	 * inode which has now been turned into a OBJTYPE_HARDLINK pointer.
	 *
	 * We must reconnect the vp.
	 */
	error = hammer2_inode_connect(dip, ip, name, name_len);
	if (error == 0) {
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, ap->a_vp);
	}
done:
	hammer2_inode_drop(ip);
	return error;
}
1793
1794 /*
1795  * hammer2_vop_ncreate { nch, dvp, vpp, cred, vap }
1796  *
1797  * The operating system has already ensured that the directory entry
1798  * does not exist and done all appropriate namespace locking.
1799  */
1800 static
1801 int
1802 hammer2_vop_ncreate(struct vop_ncreate_args *ap)
1803 {
1804         hammer2_mount_t *hmp;
1805         hammer2_inode_t *dip;
1806         hammer2_inode_t *nip;
1807         hammer2_chain_t *nchain;
1808         struct namecache *ncp;
1809         const uint8_t *name;
1810         size_t name_len;
1811         int error;
1812
1813         dip = VTOI(ap->a_dvp);
1814         hmp = dip->hmp;
1815         if (hmp->ronly)
1816                 return (EROFS);
1817
1818         ncp = ap->a_nch->ncp;
1819         name = ncp->nc_name;
1820         name_len = ncp->nc_nlen;
1821
1822         error = hammer2_inode_create(dip, ap->a_vap, ap->a_cred,
1823                                      name, name_len, &nip, &nchain);
1824         if (error) {
1825                 KKASSERT(nip == NULL);
1826                 *ap->a_vpp = NULL;
1827                 return error;
1828         }
1829         *ap->a_vpp = hammer2_igetv(nip, &error);
1830         hammer2_inode_unlock_ex(nip, nchain);
1831
1832         if (error == 0) {
1833                 cache_setunresolved(ap->a_nch);
1834                 cache_setvp(ap->a_nch, *ap->a_vpp);
1835         }
1836         return error;
1837 }
1838
/*
 * hammer2_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 *
 * Create a symbolic link.  Short targets are stored directly in the
 * inode's embedded data area; longer targets are written out as
 * regular file data.
 */
static
int
hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	hammer2_mount_t *hmp;
	hammer2_inode_t *dip;
	hammer2_inode_t *nip;
	hammer2_chain_t *nchain;
	struct namecache *ncp;
	const uint8_t *name;
	size_t name_len;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;
	if (hmp->ronly)
		return (EROFS);

	ncp = ap->a_nch->ncp;
	name = ncp->nc_name;
	name_len = ncp->nc_nlen;

	ap->a_vap->va_type = VLNK;	/* enforce type */

	error = hammer2_inode_create(dip, ap->a_vap, ap->a_cred,
				     name, name_len, &nip, &nchain);
	if (error) {
		KKASSERT(nip == NULL);
		*ap->a_vpp = NULL;
		return error;
	}
	*ap->a_vpp = hammer2_igetv(nip, &error);

	/*
	 * Build the softlink (~like file data) and finalize the namecache.
	 */
	if (error == 0) {
		size_t bytes;
		struct uio auio;
		struct iovec aiov;
		hammer2_inode_data_t *nipdata;

		nipdata = &nchain->data->ipdata;
		bytes = strlen(ap->a_target);

		if (bytes <= HAMMER2_EMBEDDED_BYTES) {
			/*
			 * Target fits in the inode's embedded data area;
			 * a freshly created inode must be in DIRECTDATA
			 * mode for this to be valid.
			 */
			KKASSERT(nipdata->op_flags &
				 HAMMER2_OPFLAG_DIRECTDATA);
			bcopy(ap->a_target, nipdata->u.data, bytes);
			nipdata->size = bytes;
		} else {
			/*
			 * Too large to embed: write the target string as
			 * file data via a synthesized SYSSPACE uio.
			 */
			bzero(&auio, sizeof(auio));
			bzero(&aiov, sizeof(aiov));
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_WRITE;
			auio.uio_resid = bytes;
			auio.uio_iovcnt = 1;
			auio.uio_td = curthread;
			aiov.iov_base = ap->a_target;
			aiov.iov_len = bytes;
			error = hammer2_write_file(nip, &nchain,
						   &auio, IO_APPEND, 0);
			/* XXX handle error */
			/* NOTE: write errors deliberately discarded here */
			error = 0;
		}
	}
	hammer2_inode_unlock_ex(nip, nchain);

	/*
	 * Finalize namecache
	 */
	if (error == 0) {
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, *ap->a_vpp);
		/* hammer2_knote(ap->a_dvp, NOTE_WRITE); */
	}
	return error;
}
1921
1922 /*
1923  * hammer2_vop_nremove { nch, dvp, cred }
1924  */
1925 static
1926 int
1927 hammer2_vop_nremove(struct vop_nremove_args *ap)
1928 {
1929         hammer2_inode_t *dip;
1930         hammer2_mount_t *hmp;
1931         struct namecache *ncp;
1932         const uint8_t *name;
1933         size_t name_len;
1934         int error;
1935
1936         dip = VTOI(ap->a_dvp);
1937         hmp = dip->hmp;
1938         if (hmp->ronly)
1939                 return(EROFS);
1940
1941         ncp = ap->a_nch->ncp;
1942         name = ncp->nc_name;
1943         name_len = ncp->nc_nlen;
1944
1945         error = hammer2_unlink_file(dip, name, name_len, 0, NULL);
1946         if (error == 0) {
1947                 cache_unlink(ap->a_nch);
1948         }
1949         return (error);
1950 }
1951
1952 /*
1953  * hammer2_vop_nrmdir { nch, dvp, cred }
1954  */
1955 static
1956 int
1957 hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
1958 {
1959         hammer2_inode_t *dip;
1960         hammer2_mount_t *hmp;
1961         struct namecache *ncp;
1962         const uint8_t *name;
1963         size_t name_len;
1964         int error;
1965
1966         dip = VTOI(ap->a_dvp);
1967         hmp = dip->hmp;
1968         if (hmp->ronly)
1969                 return(EROFS);
1970
1971         ncp = ap->a_nch->ncp;
1972         name = ncp->nc_name;
1973         name_len = ncp->nc_nlen;
1974
1975         error = hammer2_unlink_file(dip, name, name_len, 1, NULL);
1976         if (error == 0) {
1977                 cache_unlink(ap->a_nch);
1978         }
1979         return (error);
1980 }
1981
1982 /*
1983  * hammer2_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1984  */
1985 static
1986 int
1987 hammer2_vop_nrename(struct vop_nrename_args *ap)
1988 {
1989         struct namecache *fncp;
1990         struct namecache *tncp;
1991         hammer2_inode_t *fdip;
1992         hammer2_inode_t *tdip;
1993         hammer2_inode_t *ip;
1994         hammer2_chain_t *chain;
1995         hammer2_mount_t *hmp;
1996         const uint8_t *fname;
1997         size_t fname_len;
1998         const uint8_t *tname;
1999         size_t tname_len;
2000         int error;
2001
2002         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
2003                 return(EXDEV);
2004         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
2005                 return(EXDEV);
2006
2007         fdip = VTOI(ap->a_fdvp);        /* source directory */
2008         tdip = VTOI(ap->a_tdvp);        /* target directory */
2009
2010         hmp = fdip->hmp;                /* check read-only filesystem */
2011         if (hmp->ronly)
2012                 return(EROFS);
2013
2014         fncp = ap->a_fnch->ncp;         /* entry name in source */
2015         fname = fncp->nc_name;
2016         fname_len = fncp->nc_nlen;
2017
2018         tncp = ap->a_tnch->ncp;         /* entry name in target */
2019         tname = tncp->nc_name;
2020         tname_len = tncp->nc_nlen;
2021
2022         /*
2023          * ip is the inode being removed.  If this is a hardlink then
2024          * ip represents the actual file and not the hardlink marker.
2025          */
2026         ip = VTOI(fncp->nc_vp);
2027
2028         /*
2029          * Keep a tight grip on the inode as removing it should disconnect
2030          * it and we don't want to destroy it.
2031          *
2032          * NOTE: To avoid deadlocks we cannot lock (ip) while we are
2033          *       unlinking elements from their directories.  Locking
2034          *       the nlinks field does not lock the whole inode.
2035          */
2036         hammer2_inode_ref(ip);
2037
2038         /*
2039          * Remove target if it exists
2040          */
2041         error = hammer2_unlink_file(tdip, tname, tname_len, -1, NULL);
2042         if (error && error != ENOENT)
2043                 goto done;
2044         cache_setunresolved(ap->a_tnch);
2045
2046         /*
2047          * Disconnect (fdip, fname) from the source directory.  This will
2048          * disconnect (ip) if it represents a direct file.  If (ip) represents
2049          * a hardlink the HARDLINK pointer object will be removed but the
2050          * hardlink will stay intact.
2051          *
2052          * If (ip) is already hardlinked we have to resolve to a consolidated
2053          * file but we do not bump the nlinks count.  (ip) must hold the nlinks
2054          * lock & ref for the operation.  If the consolidated file has been
2055          * relocated (ip) will be adjusted and the related nlinks lock moved
2056          * along with it.
2057          *
2058          * If (ip) does not have multiple links we can just copy the physical
2059          * contents of the inode.
2060          */
2061         chain = hammer2_inode_lock_sh(ip);
2062         hammer2_chain_ref(hmp, chain);          /* for unlink file */
2063         if (chain->data->ipdata.nlinks > 1) {
2064                 hammer2_inode_unlock_sh(ip, chain);
2065                 error = hammer2_hardlink_consolidate(&ip, tdip);
2066                 if (error)
2067                         goto done;
2068         } else {
2069                 hammer2_inode_unlock_sh(ip, chain);
2070         }
2071         /* chain ref still intact */
2072
2073         /*
2074          * NOTE! Because we are retaining (ip) the unlink can fail with
2075          *       an EAGAIN.
2076          */
2077         for (;;) {
2078                 error = hammer2_unlink_file(fdip, fname, fname_len, -1, chain);
2079                 if (error != EAGAIN)
2080                         break;
2081                 kprintf("hammer2_vop_nrename: unlink race %s\n", fname);
2082                 tsleep(fdip, 0, "h2renr", 1);
2083         }
2084         hammer2_chain_drop(hmp, chain); /* drop temporary ref */
2085         if (error)
2086                 goto done;
2087
2088         /*
2089          * Reconnect ip to target directory.
2090          *
2091          * WARNING: chain locks can lock buffer cache buffers, to avoid
2092          *          deadlocks we want to unlock before issuing a cache_*()
2093          *          op (that might have to lock a vnode).
2094          */
2095         error = hammer2_inode_connect(tdip, ip, tname, tname_len);
2096         if (error == 0) {
2097                 cache_rename(ap->a_fnch, ap->a_tnch);
2098         }
2099 done:
2100         hammer2_inode_drop(ip);
2101
2102         return (error);
2103 }
2104
static int hammer2_strategy_read(struct vop_strategy_args *ap);
static int hammer2_strategy_write(struct vop_strategy_args *ap);

/*
 * hammer2_vop_strategy { vp, bio }
 *
 * Dispatch a buffer-cache I/O request to the read or write strategy
 * handler based on the buffer command.  Other commands are rejected
 * with EINVAL and the bio is completed immediately.
 */
static
int
hammer2_vop_strategy(struct vop_strategy_args *ap)
{
	struct bio *biop;
	struct buf *bp;
	int error;

	biop = ap->a_bio;
	bp = biop->bio_buf;

	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		error = hammer2_strategy_read(ap);
		++hammer2_iod_file_read;	/* I/O statistics */
		break;
	case BUF_CMD_WRITE:
		error = hammer2_strategy_write(ap);
		++hammer2_iod_file_write;	/* I/O statistics */
		break;
	default:
		/* unsupported command: fail the bio here */
		bp->b_error = error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(biop);
		break;
	}

	return (error);
}
2137
/*
 * Read strategy: translate the logical buffer offset to a device
 * offset (caching it in the pushed-down bio), then either zero-fill,
 * forward the I/O to the underlying device, or copy out data embedded
 * directly in the inode.
 */
static
int
hammer2_strategy_read(struct vop_strategy_args *ap)
{
	struct buf *bp;
	struct bio *bio;
	struct bio *nbio;
	hammer2_mount_t *hmp;
	hammer2_inode_t *ip;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t lbase;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	nbio = push_bio(bio);

	lbase = bio->bio_offset;
	chain = NULL;
	KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);

	/*
	 * We must characterize the logical->physical translation if it
	 * has not already been cached.
	 *
	 * Physical data references < LBUFSIZE are never cached.  This
	 * includes both small-block allocations and inode-embedded data.
	 */
	if (nbio->bio_offset == NOOFFSET) {
		parent = hammer2_inode_lock_sh(ip);

		chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase,
					     HAMMER2_LOOKUP_NODATA |
					     HAMMER2_LOOKUP_SHARED);
		if (chain == NULL) {
			/*
			 * Data is zero-fill
			 */
			nbio->bio_offset = ZFOFFSET;
		} else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			/*
			 * Data is embedded in the inode (do nothing).
			 * bio_offset stays NOOFFSET so the final branch
			 * below copies from the inode.
			 *
			 * NOTE(review): chain (== parent) is unlocked here
			 * but chain->data is still dereferenced in the
			 * embedded-copy branch below, after the inode
			 * shared lock is released — presumably safe because
			 * the lookup holds its own reference; verify.
			 */
			KKASSERT(chain == parent);
			hammer2_chain_unlock(hmp, chain);
		} else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			/*
			 * Data is on-media; cache the device offset in
			 * the pushed bio.
			 */
			KKASSERT(bp->b_bcount == chain->bytes);
			nbio->bio_offset = chain->bref.data_off &
					   HAMMER2_OFF_MASK;
			hammer2_chain_unlock(hmp, chain);
			KKASSERT(nbio->bio_offset != 0);
		} else {
			panic("hammer2_strategy_read: unknown bref type");
		}
		hammer2_inode_unlock_sh(ip, parent);
	}

	if (hammer2_debug & 0x0020) {
		kprintf("read %016jx %016jx\n",
			bio->bio_offset, nbio->bio_offset);
	}

	if (nbio->bio_offset == ZFOFFSET) {
		/*
		 * Data is zero-fill
		 */
		bp->b_resid = 0;
		bp->b_error = 0;
		bzero(bp->b_data, bp->b_bcount);
		biodone(nbio);
	} else if (nbio->bio_offset != NOOFFSET) {
		/*
		 * Forward direct IO to the device
		 */
		vn_strategy(hmp->devvp, nbio);
	} else {
		/*
		 * Data is embedded in inode: copy the embedded bytes and
		 * zero the remainder of the buffer.
		 */
		bcopy(chain->data->ipdata.u.data, bp->b_data,
		      HAMMER2_EMBEDDED_BYTES);
		bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
		      bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
		bp->b_resid = 0;
		bp->b_error = 0;
		biodone(nbio);
	}
	return (0);
}
2232
/*
 * Write strategy: the device offset has already been assigned by the
 * frontend (cached in the pushed-down bio).  Either copy the data into
 * the inode's embedded area (NOOFFSET) or forward the I/O directly to
 * the device.
 */
static
int
hammer2_strategy_write(struct vop_strategy_args *ap)
{
	struct buf *bp;
	struct bio *bio;
	struct bio *nbio;
	hammer2_mount_t *hmp;
	hammer2_inode_t *ip;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	nbio = push_bio(bio);

	KKASSERT((bio->bio_offset & HAMMER2_PBUFMASK64) == 0);
	KKASSERT(nbio->bio_offset != 0 && nbio->bio_offset != ZFOFFSET);

	if (nbio->bio_offset == NOOFFSET) {
		/*
		 * Must be embedded in the inode.
		 *
		 * Because the inode is dirty, the chain must exist whether
		 * the inode is locked or not. XXX
		 */
		KKASSERT(bio->bio_offset == 0);
		KKASSERT(ip->chain && ip->chain->data);
		bcopy(bp->b_data, ip->chain->data->ipdata.u.data,
		      HAMMER2_EMBEDDED_BYTES);
		bp->b_resid = 0;
		bp->b_error = 0;
		biodone(nbio);

		/*
		 * This special flag does not follow the normal MODIFY rules
		 * because we might deadlock on ip.  Instead we depend on
		 * VOP_FSYNC() to detect the case.
		 */
		atomic_set_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
	} else {
		/*
		 * Forward direct IO to the device
		 */
		vn_strategy(hmp->devvp, nbio);
	}
	return (0);
}
2281
2282 /*
2283  * hammer2_vop_ioctl { vp, command, data, fflag, cred }
2284  */
2285 static
2286 int
2287 hammer2_vop_ioctl(struct vop_ioctl_args *ap)
2288 {
2289         hammer2_mount_t *hmp;
2290         hammer2_inode_t *ip;
2291         int error;
2292
2293         ip = VTOI(ap->a_vp);
2294         hmp = ip->hmp;
2295
2296         error = hammer2_ioctl(ip, ap->a_command, (void *)ap->a_data,
2297                               ap->a_fflag, ap->a_cred);
2298         return (error);
2299 }
2300
2301 static
2302 int 
2303 hammer2_vop_mountctl(struct vop_mountctl_args *ap)
2304 {
2305         struct mount *mp;
2306         hammer2_pfsmount_t *pmp;
2307         int rc;
2308
2309         switch (ap->a_op) {
2310         case (MOUNTCTL_SET_EXPORT):
2311                 mp = ap->a_head.a_ops->head.vv_mount;
2312                 pmp = MPTOPMP(mp);
2313
2314                 if (ap->a_ctllen != sizeof(struct export_args))
2315                         rc = (EINVAL);
2316                 else
2317                         rc = vfs_export(mp, &pmp->export,
2318                                         (const struct export_args *)ap->a_ctl);
2319                 break;
2320         default:
2321                 rc = vop_stdmountctl(ap);
2322                 break;
2323         }
2324         return (rc);
2325 }
2326
2327 struct vop_ops hammer2_vnode_vops = {
2328         .vop_default    = vop_defaultop,
2329         .vop_fsync      = hammer2_vop_fsync,
2330         .vop_getpages   = vop_stdgetpages,
2331         .vop_putpages   = vop_stdputpages,
2332         .vop_access     = hammer2_vop_access,
2333         .vop_advlock    = hammer2_vop_advlock,
2334         .vop_close      = hammer2_vop_close,
2335         .vop_nlink      = hammer2_vop_nlink,
2336         .vop_ncreate    = hammer2_vop_ncreate,
2337         .vop_nsymlink   = hammer2_vop_nsymlink,
2338         .vop_nremove    = hammer2_vop_nremove,
2339         .vop_nrmdir     = hammer2_vop_nrmdir,
2340         .vop_nrename    = hammer2_vop_nrename,
2341         .vop_getattr    = hammer2_vop_getattr,
2342         .vop_setattr    = hammer2_vop_setattr,
2343         .vop_readdir    = hammer2_vop_readdir,
2344         .vop_readlink   = hammer2_vop_readlink,
2345         .vop_getpages   = vop_stdgetpages,
2346         .vop_putpages   = vop_stdputpages,
2347         .vop_read       = hammer2_vop_read,
2348         .vop_write      = hammer2_vop_write,
2349         .vop_open       = hammer2_vop_open,
2350         .vop_inactive   = hammer2_vop_inactive,
2351         .vop_reclaim    = hammer2_vop_reclaim,
2352         .vop_nresolve   = hammer2_vop_nresolve,
2353         .vop_nlookupdotdot = hammer2_vop_nlookupdotdot,
2354         .vop_nmkdir     = hammer2_vop_nmkdir,
2355         .vop_ioctl      = hammer2_vop_ioctl,
2356         .vop_mountctl   = hammer2_vop_mountctl,
2357         .vop_bmap       = hammer2_vop_bmap,
2358         .vop_strategy   = hammer2_vop_strategy,
2359 };
2360
/*
 * Vnode operations vector for special-file vnodes.  Intentionally
 * empty at this stage of development (placeholder).
 */
struct vop_ops hammer2_spec_vops = {

};
2364
/*
 * Vnode operations vector for fifo vnodes.  Intentionally empty at
 * this stage of development (placeholder).
 */
struct vop_ops hammer2_fifo_vops = {

};