Merge branches 'hammer2' and 'master' of ssh://crater.dragonflybsd.org/repository...
[dragonfly.git] / sys / vfs / hammer2 / hammer2_vnops.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/fcntl.h>
39 #include <sys/buf.h>
40 #include <sys/proc.h>
41 #include <sys/namei.h>
42 #include <sys/mount.h>
43 #include <sys/vnode.h>
44 #include <sys/mountctl.h>
45 #include <sys/dirent.h>
46 #include <sys/uio.h>
47
48 #include "hammer2.h"
49
50 #define ZFOFFSET        (-2LL)
51
52 static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
53                                 int seqcount);
54 static int hammer2_write_file(hammer2_inode_t *ip, struct uio *uio, int ioflag,
55                               int seqcount);
56 static hammer2_off_t hammer2_assign_physical(hammer2_inode_t *ip,
57                                 hammer2_key_t lbase, int lblksize, int *errorp);
58 static void hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize);
59 static void hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize);
60
/*
 * Last reference to a vnode is going away but it is still cached.
 *
 * Flushes any dirty embedded data to the media (via the normal modify
 * path) and recycles the vnode immediately if the backing inode has
 * been deleted.
 */
static
int
hammer2_vop_inactive(struct vop_inactive_args *ap)
{
	struct vnode *vp;
	struct hammer2_inode *ip;
#if 0
	struct hammer2_mount *hmp;
#endif

	vp = ap->a_vp;
	ip = VTOI(vp);

	/*
	 * Degenerate case: vnode was never associated with an inode,
	 * just let the system recycle it.
	 */
	if (ip == NULL) {
		vrecycle(vp);
		return (0);
	}

	/*
	 * Detect updates to the embedded data which may be synchronized by
	 * the strategy code.  Simply mark the inode modified so it gets
	 * picked up by our normal flush.
	 *
	 * NOTE(review): the flag is tested before the exclusive lock is
	 * taken; presumably a race here is benign because the flag is
	 * re-cleared atomically under the lock — confirm.
	 */
	if (ip->chain.flags & HAMMER2_CHAIN_DIRTYEMBED) {
		hammer2_inode_lock_ex(ip);
		atomic_clear_int(&ip->chain.flags, HAMMER2_CHAIN_DIRTYEMBED);
		hammer2_chain_modify(ip->hmp, &ip->chain, 0);
		hammer2_inode_unlock_ex(ip);
	}

	/*
	 * Check for deleted inodes and recycle immediately.  Recycling
	 * leads into hammer2_vop_reclaim() which finishes the teardown.
	 */
	if (ip->chain.flags & HAMMER2_CHAIN_DELETED) {
		vrecycle(vp);
	}
	return (0);
}
105
/*
 * Reclaim a vnode so that it can be reused; after the inode is
 * disassociated, the filesystem must manage it alone.
 *
 * Severs the vp <-> ip association under the exclusive inode lock,
 * flushes the chain, and drops the chain reference that the vnode
 * association held.
 */
static
int
hammer2_vop_reclaim(struct vop_reclaim_args *ap)
{
	struct hammer2_inode *ip;
	struct hammer2_mount *hmp;
	struct vnode *vp;

	vp = ap->a_vp;
	ip = VTOI(vp);
	if (ip == NULL)
		return(0);
	hmp = ip->hmp;

	/*
	 * Set SUBMODIFIED so we can detect and propagate the DESTROYED
	 * bit in the flush code.
	 *
	 * The v_data/ip->vp pointers are cleared while the lock is held
	 * so no new references to the pairing can be formed.
	 */
	hammer2_inode_lock_ex(ip);
	vp->v_data = NULL;
	ip->vp = NULL;
	if (ip->chain.flags & HAMMER2_CHAIN_DELETED) {
		atomic_set_int(&ip->chain.flags, HAMMER2_CHAIN_DESTROYED |
						 HAMMER2_CHAIN_SUBMODIFIED);
	}
	hammer2_chain_flush(hmp, &ip->chain);
	hammer2_inode_unlock_ex(ip);
	hammer2_chain_drop(hmp, &ip->chain);	/* vp ref */

	/*
	 * XXX handle background sync when ip dirty, kernel will no longer
	 * notify us regarding this inode because there is no longer a
	 * vnode attached to it.
	 */

	return (0);
}
147
/*
 * hammer2_vop_fsync { vp, waitfor, flags }
 *
 * Synchronize the vnode's dirty buffers via vfsync(), pick up any
 * strategy-code updates to embedded data, and (for an actual fsync(2)
 * syscall only) flush the inode chain to media.
 */
static
int
hammer2_vop_fsync(struct vop_fsync_args *ap)
{
	struct hammer2_inode *ip;
	struct hammer2_mount *hmp;
	struct vnode *vp;

	vp = ap->a_vp;
	ip = VTOI(vp);
	hmp = ip->hmp;

	hammer2_inode_lock_ex(ip);
	vfsync(vp, ap->a_waitfor, 1, NULL, NULL);

	/*
	 * Detect updates to the embedded data which may be synchronized by
	 * the strategy code.  Simply mark the inode modified so it gets
	 * picked up by our normal flush.
	 */
	if (ip->chain.flags & HAMMER2_CHAIN_DIRTYEMBED) {
		atomic_clear_int(&ip->chain.flags, HAMMER2_CHAIN_DIRTYEMBED);
		hammer2_chain_modify(hmp, &ip->chain, 0);
	}

	/*
	 * Calling chain_flush here creates a lot of duplicative
	 * COW operations due to non-optimal vnode ordering.
	 *
	 * Only do it for an actual fsync() syscall.  The other forms
	 * which call this function will eventually call chain_flush
	 * on the volume root as a catch-all, which is far more optimal.
	 */
	if (ap->a_flags & VOP_FSYNC_SYSCALL)
		hammer2_chain_flush(hmp, &ip->chain);
	hammer2_inode_unlock_ex(ip);
	return (0);
}
186
187 static
188 int
189 hammer2_vop_access(struct vop_access_args *ap)
190 {
191         hammer2_inode_t *ip = VTOI(ap->a_vp);
192         uid_t uid;
193         gid_t gid;
194         int error;
195
196         uid = hammer2_to_unix_xid(&ip->ip_data.uid);
197         gid = hammer2_to_unix_xid(&ip->ip_data.gid);
198
199         error = vop_helper_access(ap, uid, gid, ip->ip_data.mode,
200                                   ip->ip_data.uflags);
201         return (error);
202 }
203
204 static
205 int
206 hammer2_vop_getattr(struct vop_getattr_args *ap)
207 {
208         hammer2_pfsmount_t *pmp;
209         hammer2_inode_t *ip;
210         struct vnode *vp;
211         struct vattr *vap;
212
213         vp = ap->a_vp;
214         vap = ap->a_vap;
215
216         ip = VTOI(vp);
217         pmp = ip->pmp;
218
219         hammer2_inode_lock_sh(ip);
220
221         vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
222         vap->va_fileid = ip->ip_data.inum;
223         vap->va_mode = ip->ip_data.mode;
224         vap->va_nlink = ip->ip_data.nlinks;
225         vap->va_uid = 0;
226         vap->va_gid = 0;
227         vap->va_rmajor = 0;
228         vap->va_rminor = 0;
229         vap->va_size = ip->ip_data.size;
230         vap->va_blocksize = HAMMER2_PBUFSIZE;
231         vap->va_flags = ip->ip_data.uflags;
232         hammer2_time_to_timespec(ip->ip_data.ctime, &vap->va_ctime);
233         hammer2_time_to_timespec(ip->ip_data.mtime, &vap->va_mtime);
234         hammer2_time_to_timespec(ip->ip_data.mtime, &vap->va_atime);
235         vap->va_gen = 1;
236         vap->va_bytes = vap->va_size;   /* XXX */
237         vap->va_type = hammer2_get_vtype(ip);
238         vap->va_filerev = 0;
239         vap->va_uid_uuid = ip->ip_data.uid;
240         vap->va_gid_uuid = ip->ip_data.gid;
241         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
242                           VA_FSID_UUID_VALID;
243
244         hammer2_inode_unlock_sh(ip);
245
246         return (0);
247 }
248
/*
 * hammer2_vop_setattr { vp, vap, cred }
 *
 * Apply attribute changes.  Currently handles chflags and file resize;
 * uid/gid and time updates are not yet implemented.
 */
static
int
hammer2_vop_setattr(struct vop_setattr_args *ap)
{
	hammer2_mount_t *hmp;
	hammer2_inode_t *ip;
	struct vnode *vp;
	struct vattr *vap;
	int error;
	int kflags = 0;		/* XXX accumulated but never knoted */
	int doctime = 0;	/* XXX set but ctime update not implemented */
	int domtime = 0;	/* XXX set but mtime update not implemented */

	vp = ap->a_vp;
	vap = ap->a_vap;

	ip = VTOI(vp);
	hmp = ip->hmp;

	/* No attribute changes on a read-only mount. */
	if (hmp->ronly)
		return(EROFS);

	hammer2_inode_lock_ex(ip);
	error = 0;

	/*
	 * chflags handling.  The helper validates the requested flags
	 * against the file owner and the caller's credentials.
	 */
	if (vap->va_flags != VNOVAL) {
		u_int32_t flags;

		flags = ip->ip_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer2_to_unix_xid(&ip->ip_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ip_data.uflags != flags) {
				hammer2_chain_modify(hmp, &ip->chain, 0);
				ip->ip_data.uflags = flags;
				doctime = 1;
				kflags |= NOTE_ATTRIB;
			}
			/*
			 * If the file is now immutable/append-only the
			 * flag change itself still succeeds, but no
			 * further attribute changes are attempted.
			 */
			if (ip->ip_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
	}

	/* Any other change on an immutable/append-only file is denied. */
	if (ip->ip_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	/* uid, gid */

	/*
	 * Resize the file
	 */
	if (vap->va_size != VNOVAL && ip->ip_data.size != vap->va_size) {
		switch(vp->v_type) {
		case VREG:
			/* NOTE: redundant, outer test already excludes equality */
			if (vap->va_size == ip->ip_data.size)
				break;
			if (vap->va_size < ip->ip_data.size) {
				hammer2_truncate_file(ip, vap->va_size);
			} else {
				hammer2_extend_file(ip, vap->va_size);
			}
			domtime = 1;
			break;
		default:
			/* Resize is only meaningful for regular files. */
			error = EINVAL;
			goto done;
		}
	}
done:
	hammer2_inode_unlock_ex(ip);
	return (error);
}
325
/*
 * hammer2_vop_readdir { vp, uio, cred, eofflag, ncookies, cookies }
 *
 * Emit the synthetic "." and ".." entries first, then iterate the
 * directory's inode chains by hash key, copying entries to userland
 * via vop_write_dirent().  The directory offset (saveoff) is a hash
 * key, not a byte offset.
 */
static
int
hammer2_vop_readdir(struct vop_readdir_args *ap)
{
	hammer2_mount_t *hmp;
	hammer2_inode_t *ip;
	hammer2_inode_t *xip;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t lkey;
	struct uio *uio;
	off_t *cookies;
	off_t saveoff;
	int cookie_index;
	int ncookies;
	int error;
	int dtype;
	int r;

	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	/*
	 * Setup cookies directory entry cookies if requested
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
	} else {
		ncookies = -1;
		cookies = NULL;
	}
	cookie_index = 0;

	/*
	 * Handle artificial entries.  To ensure that only positive 64 bit
	 * quantities are returned to userland we always strip off bit 63.
	 * The hash code is designed such that codes 0x0000-0x7FFF are not
	 * used, allowing us to use these codes for articial entries.
	 *
	 * Entry 0 is used for '.' and entry 1 is used for '..'.  Do not
	 * allow '..' to cross the mount point into (e.g.) the super-root.
	 */
	error = 0;
	/*
	 * Sentinel: chain must be non-NULL at 'done' for the early-goto
	 * cases so *a_eofflag is not set while artificial entries remain.
	 */
	chain = (void *)(intptr_t)-1;	/* non-NULL for early goto done case */

	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio,
				     ip->ip_data.inum &
					HAMMER2_DIRHASH_USERMSK,
				     DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/*
		 * '..' of a PFS root points at itself rather than
		 * escaping into the super-root.
		 */
		if (ip->pip == NULL || ip == ip->pmp->iroot)
			xip = ip;
		else
			xip = ip->pip;

		r = vop_write_dirent(&error, uio,
				     xip->ip_data.inum &
				      HAMMER2_DIRHASH_USERMSK,
				     DT_DIR, 2, "..");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	lkey = saveoff | HAMMER2_DIRHASH_VISIBLE;

	parent = &ip->chain;
	error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
	if (error) {
		hammer2_chain_unlock(hmp, parent);
		goto done;
	}
	/* Try an exact hit first, then the remainder of the key space. */
	chain = hammer2_chain_lookup(hmp, &parent, lkey, lkey, 0);
	if (chain == NULL) {
		chain = hammer2_chain_lookup(hmp, &parent,
					     lkey, (hammer2_key_t)-1, 0);
	}
	while (chain) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			dtype = hammer2_get_dtype(chain->u.ip);
			saveoff = chain->bref.key & HAMMER2_DIRHASH_USERMSK;
			r = vop_write_dirent(&error, uio,
					     chain->u.ip->ip_data.inum &
					      HAMMER2_DIRHASH_USERMSK,
					     dtype, chain->u.ip->ip_data.name_len,
					     chain->u.ip->ip_data.filename);
			if (r)
				break;
			if (cookies)
				cookies[cookie_index] = saveoff;
			++cookie_index;
		} else {
			/* XXX chain error */
			kprintf("bad chain type readdir %d\n",
				chain->bref.type);
		}

		/*
		 * Keys may not be returned in order so once we have a
		 * placemarker (chain) the scan must allow the full range
		 * or some entries will be missed.
		 */
		chain = hammer2_chain_next(hmp, &parent, chain,
					   0, (hammer2_key_t)-1, 0);
		if (chain) {
			saveoff = (chain->bref.key &
				   HAMMER2_DIRHASH_USERMSK) + 1;
		} else {
			saveoff = (hammer2_key_t)-1;
		}
		if (cookie_index == ncookies)
			break;
	}
	if (chain)
		hammer2_chain_unlock(hmp, chain);
	hammer2_chain_unlock(hmp, parent);
done:
	if (ap->a_eofflag)
		*ap->a_eofflag = (chain == NULL);
	uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
	/*
	 * Cookies are only handed back if at least one entry was
	 * returned; on a pure error they are freed here.
	 */
	if (error && cookie_index == 0) {
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return (error);
}
480
481 /*
482  * hammer2_vop_readlink { vp, uio, cred }
483  */
484 static
485 int
486 hammer2_vop_readlink(struct vop_readlink_args *ap)
487 {
488         struct vnode *vp;
489         hammer2_mount_t *hmp;
490         hammer2_inode_t *ip;
491         int error;
492
493         vp = ap->a_vp;
494         if (vp->v_type != VLNK)
495                 return (EINVAL);
496         ip = VTOI(vp);
497         hmp = ip->hmp;
498
499         error = hammer2_read_file(ip, ap->a_uio, 0);
500         return (error);
501 }
502
503 static
504 int
505 hammer2_vop_read(struct vop_read_args *ap)
506 {
507         struct vnode *vp;
508         hammer2_mount_t *hmp;
509         hammer2_inode_t *ip;
510         struct uio *uio;
511         int error;
512         int seqcount;
513         int bigread;
514
515         /*
516          * Read operations supported on this vnode?
517          */
518         vp = ap->a_vp;
519         if (vp->v_type != VREG)
520                 return (EINVAL);
521
522         /*
523          * Misc
524          */
525         ip = VTOI(vp);
526         hmp = ip->hmp;
527         uio = ap->a_uio;
528         error = 0;
529
530         seqcount = ap->a_ioflag >> 16;
531         bigread = (uio->uio_resid > 100 * 1024 * 1024);
532
533         error = hammer2_read_file(ip, uio, seqcount);
534         return (error);
535 }
536
537 static
538 int
539 hammer2_vop_write(struct vop_write_args *ap)
540 {
541         thread_t td;
542         struct vnode *vp;
543         hammer2_mount_t *hmp;
544         hammer2_inode_t *ip;
545         struct uio *uio;
546         int error;
547         int seqcount;
548         int bigwrite;
549
550         /*
551          * Read operations supported on this vnode?
552          */
553         vp = ap->a_vp;
554         if (vp->v_type != VREG)
555                 return (EINVAL);
556
557         /*
558          * Misc
559          */
560         ip = VTOI(vp);
561         hmp = ip->hmp;
562         uio = ap->a_uio;
563         error = 0;
564         if (hmp->ronly)
565                 return (EROFS);
566
567         seqcount = ap->a_ioflag >> 16;
568         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
569
570         /*
571          * Check resource limit
572          */
573         if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
574             uio->uio_offset + uio->uio_resid >
575              td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
576                 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
577                 return (EFBIG);
578         }
579
580         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
581
582         /*
583          * ip must be locked if extending the file.
584          * ip must be locked to avoid racing a truncation.
585          *
586          * ip must be marked modified, particularly because the write
587          * might wind up being copied into the embedded data area.
588          */
589         hammer2_inode_lock_ex(ip);
590         hammer2_chain_modify(hmp, &ip->chain, 0);
591         error = hammer2_write_file(ip, uio, ap->a_ioflag, seqcount);
592
593         hammer2_inode_unlock_ex(ip);
594         return (error);
595 }
596
597 /*
598  * Perform read operations on a file or symlink given an UNLOCKED
599  * inode and uio.
600  */
601 static
602 int
603 hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
604 {
605         struct buf *bp;
606         int error;
607
608         error = 0;
609
610         /*
611          * UIO read loop
612          */
613         while (uio->uio_resid > 0 && uio->uio_offset < ip->ip_data.size) {
614                 hammer2_key_t lbase;
615                 hammer2_key_t leof;
616                 int lblksize;
617                 int loff;
618                 int n;
619
620                 lblksize = hammer2_calc_logical(ip, uio->uio_offset,
621                                                 &lbase, &leof);
622
623                 error = cluster_read(ip->vp, leof, lbase, lblksize,
624                                      uio->uio_resid, seqcount * BKVASIZE,
625                                      &bp);
626
627                 if (error)
628                         break;
629                 loff = (int)(uio->uio_offset - lbase);
630                 n = lblksize - loff;
631                 if (n > uio->uio_resid)
632                         n = uio->uio_resid;
633                 if (n > ip->ip_data.size - uio->uio_offset)
634                         n = (int)(ip->ip_data.size - uio->uio_offset);
635                 bp->b_flags |= B_AGE;
636                 uiomove((char *)bp->b_data + loff, n, uio);
637                 bqrelse(bp);
638         }
639         return (error);
640 }
641
/*
 * Called with a locked (ip) to do the underlying write to a file or
 * to build the symlink target.
 *
 * Extends the file up-front if needed, then loops copying uio data
 * into logical buffers, assigning physical storage before dirtying
 * each buffer.  On error the file is truncated back to its original
 * EOF.  The chain lock is temporarily dropped around bwillwrite()
 * and uiomove() since both can block/fault.
 */
static
int
hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
		   int ioflag, int seqcount)
{
	hammer2_key_t old_eof;
	struct buf *bp;
	int kflags;	/* XXX accumulated but knote call is commented out */
	int error;

	/*
	 * Setup if append
	 */
	if (ioflag & IO_APPEND)
		uio->uio_offset = ip->ip_data.size;
	kflags = 0;
	error = 0;

	/*
	 * Extend the file if necessary.  If the write fails at some point
	 * we will truncate it back down to cover as much as we were able
	 * to write.
	 *
	 * Doing this now makes it easier to calculate buffer sizes in
	 * the loop.
	 */
	old_eof = ip->ip_data.size;
	if (uio->uio_offset + uio->uio_resid > ip->ip_data.size) {
		hammer2_extend_file(ip, uio->uio_offset + uio->uio_resid);
		kflags |= NOTE_EXTEND;
	}

	/*
	 * UIO write loop
	 */
	while (uio->uio_resid > 0) {
		hammer2_key_t lbase;
		hammer2_key_t leof;
		int trivial;
		int lblksize;
		int loff;
		int n;

		/*
		 * Don't allow the buffer build to blow out the buffer
		 * cache.
		 */
		if ((ioflag & IO_RECURSE) == 0) {
			/*
			 * XXX should try to leave this unlocked through
			 *	the whole loop
			 */
			hammer2_chain_unlock(ip->hmp, &ip->chain);
			bwillwrite(HAMMER2_PBUFSIZE);
			hammer2_chain_lock(ip->hmp, &ip->chain,
					   HAMMER2_RESOLVE_ALWAYS);
		}

		/* XXX bigwrite & signal check test */

		/*
		 * This nominally tells us how much we can cluster and
		 * what the logical buffer size needs to be.  Currently
		 * we don't try to cluster the write and just handle one
		 * block at a time.
		 */
		lblksize = hammer2_calc_logical(ip, uio->uio_offset,
						&lbase, &leof);
		loff = (int)(uio->uio_offset - lbase);

		/*
		 * Calculate bytes to copy this transfer and whether the
		 * copy completely covers the buffer or not.
		 */
		trivial = 0;
		n = lblksize - loff;
		if (n > uio->uio_resid) {
			n = uio->uio_resid;
			if (uio->uio_offset + n == ip->ip_data.size)
				trivial = 1;
		} else if (loff == 0) {
			trivial = 1;
		}

		/*
		 * Get the buffer
		 */
		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ip->vp, lbase, lblksize, &bp);
			}
		} else if (trivial) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 *
			 * (The strategy code will detect zero-fill physical
			 * blocks for this case).
			 */
			error = bread(ip->vp, lbase, lblksize, &bp);
			if (error == 0)
				bheavy(bp);
		}

		if (error) {
			brelse(bp);
			break;
		}

		/*
		 * We have to assign physical storage to the buffer we intend
		 * to dirty or write now to avoid deadlocks in the strategy
		 * code later.
		 *
		 * This can return NOOFFSET for inode-embedded data.  The
		 * strategy code will take care of it in that case.
		 */
		bp->b_bio2.bio_offset =
			hammer2_assign_physical(ip, lbase, lblksize, &error);
		if (error) {
			brelse(bp);
			break;
		}

		/*
		 * Ok, copy the data in.  uiomove() can fault on the user
		 * buffer so drop the chain lock across it.
		 */
		hammer2_chain_unlock(ip->hmp, &ip->chain);
		error = uiomove(bp->b_data + loff, n, uio);
		hammer2_chain_lock(ip->hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
		kflags |= NOTE_WRITE;

		if (error) {
			brelse(bp);
			break;
		}

		/* XXX update ino_data.mtime */

		/*
		 * Once we dirty a buffer any cached offset becomes invalid.
		 *
		 * NOTE: For cluster_write() always use the trailing block
		 *	 size, which is HAMMER2_PBUFSIZE.  lblksize is the
		 *	 eof-straddling blocksize and is incorrect.
		 */
		bp->b_flags |= B_AGE;
		if (ioflag & IO_SYNC) {
			bwrite(bp);
		} else if ((ioflag & IO_DIRECT) && loff + n == lblksize) {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		} else if (ioflag & IO_ASYNC) {
			bawrite(bp);
		} else if (hammer2_cluster_enable) {
			bp->b_flags |= B_CLUSTEROK;
			cluster_write(bp, leof, HAMMER2_PBUFSIZE, seqcount);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
	}

	/*
	 * Cleanup.  If we extended the file EOF but failed to write through
	 * the entire write is a failure and we have to back-up.
	 */
	if (error && ip->ip_data.size != old_eof)
		hammer2_truncate_file(ip, old_eof);
	/* hammer2_knote(ip->vp, kflags); */
	return error;
}
836
/*
 * Assign physical storage to a logical block.
 *
 * NOOFFSET is returned if the data is inode-embedded.  In this case the
 * strategy code will simply bcopy() the data into the inode.
 *
 * Returns the physical (device) offset of the block's backing store,
 * masked to the allocation boundary.  *errorp is cleared on entry and is
 * not otherwise set here (failure cases panic instead).
 */
static
hammer2_off_t
hammer2_assign_physical(hammer2_inode_t *ip, hammer2_key_t lbase,
			int lblksize, int *errorp)
{
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_off_t pbase;

	*errorp = 0;
	hmp = ip->hmp;

	/*
	 * Locate the chain associated with lbase, return a locked chain.
	 * However, do not instantiate any data reference (which utilizes a
	 * device buffer) because we will be using direct IO via the
	 * logical buffer cache buffer.
	 */
	parent = &ip->chain;
	hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);

	chain = hammer2_chain_lookup(hmp, &parent,
				     lbase, lbase,
				     HAMMER2_LOOKUP_NODATA);

	if (chain == NULL) {
		/*
		 * We found a hole, create a new chain entry.
		 *
		 * NOTE: DATA chains are created without device backing
		 *	 store (nor do we want any).
		 */
		chain = hammer2_chain_create(hmp, parent, NULL,
					     lbase, HAMMER2_PBUFRADIX,
					     HAMMER2_BREF_TYPE_DATA,
					     lblksize);
		pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
	} else {
		switch (chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * The data is embedded in the inode.  The
			 * caller is responsible for marking the inode
			 * modified and copying the data to the embedded
			 * area.
			 */
			pbase = NOOFFSET;
			break;
		case HAMMER2_BREF_TYPE_DATA:
			if (chain->bytes != lblksize) {
				panic("hammer2_assign_physical: "
				      "size mismatch %d/%d\n",
				      lblksize, chain->bytes);
			}
			/*
			 * Mark the chain modified so storage can be
			 * (re)assigned.  OPTDATA avoids instantiating a
			 * device buffer; the logical buffer supplies the
			 * data.
			 */
			hammer2_chain_modify(hmp, chain,
					     HAMMER2_MODIFY_OPTDATA);
			pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
			break;
		default:
			panic("hammer2_assign_physical: bad type");
			/* NOT REACHED */
			pbase = NOOFFSET;
			break;
		}
	}

	if (chain)
		hammer2_chain_unlock(hmp, chain);
	hammer2_chain_unlock(hmp, parent);

	return (pbase);
}
916
/*
 * Truncate the size of a file.
 *
 * This routine adjusts ip->ip_data.size smaller, destroying any related
 * data beyond the new EOF and potentially resizing the block straddling
 * the EOF.
 *
 * The inode must be locked.
 */
static
void
hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_mount_t *hmp = ip->hmp;
	hammer2_key_t lbase;	/* logical base of the straddling block */
	hammer2_key_t leof;	/* logical EOF (block-aligned) */
	struct buf *bp;
	int loff;		/* offset of nsize within its block */
	int error;
	int oblksize;
	int nblksize;

	hammer2_chain_modify(hmp, &ip->chain, 0);
	bp = NULL;

	/*
	 * Destroy any logical buffer cache buffers beyond the file EOF.
	 *
	 * We call nvtruncbuf() w/ trivial == 1 to prevent it from messing
	 * around with the buffer straddling EOF, because we need to assign
	 * a new physical offset to it.
	 */
	if (ip->vp) {
		nvtruncbuf(ip->vp, nsize,
			   HAMMER2_PBUFSIZE, (int)nsize & HAMMER2_PBUFMASK,
			   1);
	}

	/*
	 * Setup for lookup/search
	 */
	parent = &ip->chain;
	error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
	if (error) {
		/*
		 * NOTE(review): unlocking after a failed lock looks
		 * suspicious -- confirm hammer2_chain_lock() leaves the
		 * chain locked on error.
		 */
		hammer2_chain_unlock(hmp, parent);
		/* XXX error reporting */
		return;
	}

	/*
	 * Handle the case where a chain/logical-buffer straddles the new
	 * EOF.  We told nvtruncbuf() above not to mess with the logical
	 * buffer straddling the EOF because we need to reassign its storage
	 * and can't let the strategy code do it for us.
	 */
	loff = (int)nsize & HAMMER2_PBUFMASK;
	if (loff && ip->vp) {
		/* Read the straddling buffer at its OLD blocksize */
		oblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
		error = bread(ip->vp, lbase, oblksize, &bp);
		KKASSERT(error == 0);
	}
	/* Commit the new size, then compute the NEW straddling blocksize */
	ip->ip_data.size = nsize;
	nblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);

	/*
	 * Fixup the chain element.  If we have a logical buffer in-hand
	 * we don't want to create a conflicting device buffer.
	 */
	if (loff && bp) {
		chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase,
					     HAMMER2_LOOKUP_NODATA);
		if (chain) {
			allocbuf(bp, nblksize);
			switch(chain->bref.type) {
			case HAMMER2_BREF_TYPE_DATA:
				/*
				 * Shrink the physical block and zero the
				 * tail past the new EOF, then point bio2
				 * at the (possibly new) physical offset.
				 */
				hammer2_chain_resize(hmp, chain,
					     hammer2_bytes_to_radix(nblksize),
					     HAMMER2_MODIFY_OPTDATA);
				bzero(bp->b_data + loff, nblksize - loff);
				bp->b_bio2.bio_offset = chain->bref.data_off &
							HAMMER2_OFF_MASK;
				break;
			case HAMMER2_BREF_TYPE_INODE:
				/* Inode-embedded data: no device offset */
				bzero(bp->b_data + loff, nblksize - loff);
				bp->b_bio2.bio_offset = NOOFFSET;
				break;
			default:
				panic("hammer2_truncate_file: bad type");
				break;
			}
			hammer2_chain_unlock(hmp, chain);
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		} else {
			/*
			 * Destroy clean buffer w/ wrong buffer size.  Retain
			 * backing store.
			 */
			bp->b_flags |= B_RELBUF;
			KKASSERT(bp->b_bio2.bio_offset == NOOFFSET);
			KKASSERT((bp->b_flags & B_DIRTY) == 0);
			bqrelse(bp);
		}
	} else if (loff) {
		/*
		 * WARNING: This utilizes a device buffer for the data.
		 *
		 * XXX case should not occur
		 */
		panic("hammer2_truncate_file: non-zero truncation, no-vnode");
		chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase, 0);
		if (chain) {
			switch(chain->bref.type) {
			case HAMMER2_BREF_TYPE_DATA:
				hammer2_chain_resize(hmp, chain,
					     hammer2_bytes_to_radix(nblksize),
					     0);
				hammer2_chain_modify(hmp, chain, 0);
				bzero(chain->data->buf + loff, nblksize - loff);
				break;
			case HAMMER2_BREF_TYPE_INODE:
				if (loff < HAMMER2_EMBEDDED_BYTES) {
					hammer2_chain_modify(hmp, chain, 0);
					bzero(chain->data->ipdata.u.data + loff,
					      HAMMER2_EMBEDDED_BYTES - loff);
				}
				break;
			}
			hammer2_chain_unlock(hmp, chain);
		}
	}

	/*
	 * Clean up any fragmentary VM pages now that we have properly
	 * resized the straddling buffer.  These pages are no longer
	 * part of the buffer.
	 */
	if (ip->vp) {
		nvtruncbuf(ip->vp, nsize,
			   nblksize, (int)nsize & (nblksize - 1),
			   1);
	}

	/*
	 * Destroy any physical blocks after the new EOF point.
	 */
	lbase = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
	chain = hammer2_chain_lookup(hmp, &parent,
				     lbase, (hammer2_key_t)-1,
				     HAMMER2_LOOKUP_NODATA);
	while (chain) {
		/*
		 * Degenerate embedded data case, nothing to loop on.
		 */
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			hammer2_chain_unlock(hmp, chain);
			break;
		}

		/*
		 * Delete physical data blocks past the file EOF.
		 */
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			hammer2_chain_delete(hmp, parent, chain);
		}
		/* XXX check parent if empty indirect block & delete */
		chain = hammer2_chain_next(hmp, &parent, chain,
					   lbase, (hammer2_key_t)-1,
					   HAMMER2_LOOKUP_NODATA);
	}
	hammer2_chain_unlock(hmp, parent);
}
1091
1092 /*
1093  * Extend the size of a file.  The inode must be locked.
1094  *
1095  * We may have to resize the block straddling the old EOF.
1096  */
1097 static
1098 void
1099 hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
1100 {
1101         hammer2_mount_t *hmp;
1102         hammer2_chain_t *parent;
1103         hammer2_chain_t *chain;
1104         struct buf *bp;
1105         hammer2_key_t osize;
1106         hammer2_key_t obase;
1107         hammer2_key_t nbase;
1108         hammer2_key_t leof;
1109         int oblksize;
1110         int nblksize;
1111         int nradix;
1112         int error;
1113
1114         KKASSERT(ip->vp);
1115         hmp = ip->hmp;
1116
1117         hammer2_chain_modify(hmp, &ip->chain, 0);
1118
1119         /*
1120          * Nothing to do if the direct-data case is still intact
1121          */
1122         if ((ip->ip_data.op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
1123             nsize <= HAMMER2_EMBEDDED_BYTES) {
1124                 ip->ip_data.size = nsize;
1125                 return;
1126         }
1127
1128         /*
1129          * Calculate the blocksize at the original EOF and resize the block
1130          * if necessary.  Adjust the file size in the inode.
1131          */
1132         osize = ip->ip_data.size;
1133         oblksize = hammer2_calc_logical(ip, osize, &obase, &leof);
1134         ip->ip_data.size = nsize;
1135         nblksize = hammer2_calc_logical(ip, osize, &nbase, &leof);
1136
1137         /*
1138          * Do all required vnode operations, but do not mess with the
1139          * buffer straddling the orignal EOF.
1140          */
1141         nvextendbuf(ip->vp,
1142                     ip->ip_data.size, nsize,
1143                     0, nblksize,
1144                     0, (int)nsize & HAMMER2_PBUFMASK,
1145                     1);
1146
1147         /*
1148          * Early return if we have no more work to do.
1149          */
1150         if (obase == nbase && oblksize == nblksize &&
1151             (ip->ip_data.op_flags & HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1152                 return;
1153         }
1154
1155         /*
1156          * We have work to do, including possibly resizing the buffer
1157          * at the EOF point and turning off DIRECTDATA mode.
1158          */
1159         bp = NULL;
1160         if (((int)osize & HAMMER2_PBUFMASK)) {
1161                 error = bread(ip->vp, obase, oblksize, &bp);
1162                 KKASSERT(error == 0);
1163
1164                 if (obase != nbase) {
1165                         allocbuf(bp, HAMMER2_PBUFSIZE);
1166                 } else {
1167                         allocbuf(bp, nblksize);
1168                 }
1169                 vfs_bio_clrbuf(bp);
1170         }
1171
1172         /*
1173          * Disable direct-data mode by loading up a buffer cache buffer
1174          * with the data, then converting the inode data area into the
1175          * inode indirect block array area.
1176          */
1177         if (ip->ip_data.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1178                 ip->ip_data.op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
1179                 bzero(&ip->ip_data.u.blockset, sizeof(ip->ip_data.u.blockset));
1180         }
1181
1182         /*
1183          * Resize the chain element at the old EOF.
1184          */
1185         if (((int)osize & HAMMER2_PBUFMASK)) {
1186                 parent = &ip->chain;
1187                 error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
1188                 KKASSERT(error == 0);
1189
1190                 nradix = hammer2_bytes_to_radix(nblksize);
1191
1192                 chain = hammer2_chain_lookup(hmp, &parent,
1193                                              obase, obase,
1194                                              HAMMER2_LOOKUP_NODATA);
1195                 if (chain == NULL) {
1196                         chain = hammer2_chain_create(hmp, parent, NULL,
1197                                                      obase, nblksize,
1198                                                      HAMMER2_BREF_TYPE_DATA,
1199                                                      nblksize);
1200                 } else {
1201                         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
1202                         hammer2_chain_resize(hmp, chain, nradix,
1203                                              HAMMER2_MODIFY_OPTDATA);
1204                 }
1205                 bp->b_bio2.bio_offset = chain->bref.data_off &
1206                                         HAMMER2_OFF_MASK;
1207                 hammer2_chain_unlock(hmp, chain);
1208                 bp->b_flags |= B_CLUSTEROK;
1209                 bdwrite(bp);
1210                 hammer2_chain_unlock(hmp, parent);
1211         }
1212 }
1213
/*
 * hammer2_vop_nresolve { nch, dvp, cred }
 *
 * Resolve a directory entry by name: scan all chains sharing the name's
 * directory hash and match on the exact filename, then acquire and cache
 * the associated vnode.
 */
static
int
hammer2_vop_nresolve(struct vop_nresolve_args *ap)
{
	hammer2_inode_t *dip;
	hammer2_mount_t *hmp;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	struct namecache *ncp;
	const uint8_t *name;
	size_t name_len;
	hammer2_key_t lhc;
	int error = 0;
	struct vnode *vp;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;
	ncp = ap->a_nch->ncp;
	name = ncp->nc_name;
	name_len = ncp->nc_nlen;
	lhc = hammer2_dirhash(name, name_len);

	/*
	 * Note: In DragonFly the kernel handles '.' and '..'.
	 */
	parent = &dip->chain;
	hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
	/*
	 * Iterate the [lhc, lhc+LOMASK] key range to cover hash
	 * collisions; compare the actual filename to disambiguate.
	 */
	chain = hammer2_chain_lookup(hmp, &parent,
				     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
				     0);
	while (chain) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
		    chain->u.ip &&
		    name_len == chain->data->ipdata.name_len &&
		    bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
			break;
		}
		chain = hammer2_chain_next(hmp, &parent, chain,
					   lhc, lhc + HAMMER2_DIRHASH_LOMASK,
					   0);
	}
	hammer2_chain_unlock(hmp, parent);

	if (chain) {
		/*
		 * Acquire the vnode, resolve the namecache entry, then
		 * release our chain lock (still held from the lookup).
		 */
		vp = hammer2_igetv(chain->u.ip, &error);
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		hammer2_chain_unlock(hmp, chain);
	} else {
		/* Not found: negative-cache the entry */
		error = ENOENT;
		cache_setvp(ap->a_nch, NULL);
	}
	return error;
}
1271
1272 static
1273 int
1274 hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1275 {
1276         hammer2_inode_t *dip;
1277         hammer2_inode_t *ip;
1278         hammer2_mount_t *hmp;
1279         int error;
1280
1281         dip = VTOI(ap->a_dvp);
1282         hmp = dip->hmp;
1283
1284         if ((ip = dip->pip) == NULL) {
1285                 *ap->a_vpp = NULL;
1286                 return ENOENT;
1287         }
1288         hammer2_chain_lock(hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
1289         *ap->a_vpp = hammer2_igetv(ip, &error);
1290         hammer2_chain_unlock(hmp, &ip->chain);
1291
1292         return error;
1293 }
1294
1295 static
1296 int
1297 hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
1298 {
1299         hammer2_mount_t *hmp;
1300         hammer2_inode_t *dip;
1301         hammer2_inode_t *nip;
1302         struct namecache *ncp;
1303         const uint8_t *name;
1304         size_t name_len;
1305         int error;
1306
1307         dip = VTOI(ap->a_dvp);
1308         hmp = dip->hmp;
1309         if (hmp->ronly)
1310                 return (EROFS);
1311
1312         ncp = ap->a_nch->ncp;
1313         name = ncp->nc_name;
1314         name_len = ncp->nc_nlen;
1315
1316         error = hammer2_inode_create(dip, ap->a_vap, ap->a_cred,
1317                                      name, name_len, &nip);
1318         if (error) {
1319                 KKASSERT(nip == NULL);
1320                 *ap->a_vpp = NULL;
1321                 return error;
1322         }
1323         *ap->a_vpp = hammer2_igetv(nip, &error);
1324         hammer2_chain_unlock(hmp, &nip->chain);
1325
1326         if (error == 0) {
1327                 cache_setunresolved(ap->a_nch);
1328                 cache_setvp(ap->a_nch, *ap->a_vpp);
1329         }
1330         return error;
1331 }
1332
/*
 * Return the largest contiguous physical disk range for the logical
 * request.
 *
 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
 */
static
int
hammer2_vop_bmap(struct vop_bmap_args *ap)
{
	struct vnode *vp;
	hammer2_mount_t *hmp;
	hammer2_inode_t *ip;
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t lbeg;
	hammer2_key_t lend;
	hammer2_off_t pbeg;
	hammer2_off_t pbytes;
	/* array[i] = { physical offset, byte count } per logical block */
	hammer2_off_t array[HAMMER2_BMAP_COUNT][2];
	int loff;
	int ai;

	/*
	 * Only supported on regular files
	 *
	 * Only supported for read operations (required for cluster_read).
	 * The block allocation is delayed for write operations.
	 */
	vp = ap->a_vp;
	if (vp->v_type != VREG)
		return (EOPNOTSUPP);
	if (ap->a_cmd != BUF_CMD_READ)
		return (EOPNOTSUPP);

	ip = VTOI(vp);
	hmp = ip->hmp;
	bzero(array, sizeof(array));

	/*
	 * Calculate logical range
	 */
	KKASSERT((ap->a_loffset & HAMMER2_LBUFMASK64) == 0);
	lbeg = ap->a_loffset & HAMMER2_OFF_MASK_HI;
	lend = lbeg + HAMMER2_BMAP_COUNT * HAMMER2_PBUFSIZE - 1;
	if (lend < lbeg)	/* guard against wrap at top of key space */
		lend = lbeg;
	loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;

	/*
	 * Collect the physical offset/size of every data chain in the
	 * logical range into array[], indexed by logical block.
	 */
	parent = &ip->chain;
	hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
	chain = hammer2_chain_lookup(hmp, &parent,
				     lbeg, lend,
				     HAMMER2_LOOKUP_NODATA);
	if (chain == NULL) {
		/* No backing store at all: report a zero-fill range */
		*ap->a_doffsetp = ZFOFFSET;
		hammer2_chain_unlock(hmp, parent);
		return (0);
	}

	while (chain) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
			ai = (chain->bref.key - lbeg) / HAMMER2_PBUFSIZE;
			KKASSERT(ai >= 0 && ai < HAMMER2_BMAP_COUNT);
			array[ai][0] = chain->bref.data_off & HAMMER2_OFF_MASK;
			array[ai][1] = chain->bytes;
		}
		chain = hammer2_chain_next(hmp, &parent, chain,
					   lbeg, lend,
					   HAMMER2_LOOKUP_NODATA);
	}
	hammer2_chain_unlock(hmp, parent);

	/*
	 * If the requested loffset is not mappable physically we can't
	 * bmap.  The caller will have to access the file data via a
	 * device buffer.
	 *
	 * NOTE(review): offset 0 is used as the "unallocated" sentinel
	 * here -- confirm a data chain can never legitimately live at
	 * physical offset 0.
	 */
	if (array[0][0] == 0 || array[0][1] < loff + HAMMER2_LBUFSIZE) {
		*ap->a_doffsetp = NOOFFSET;
		return (0);
	}

	/*
	 * Calculate the physical disk offset range for array[0]
	 */
	pbeg = array[0][0] + loff;
	pbytes = array[0][1] - loff;

	/* Extend the run while subsequent blocks are physically adjacent */
	for (ai = 1; ai < HAMMER2_BMAP_COUNT; ++ai) {
		if (array[ai][0] != pbeg + pbytes)
			break;
		pbytes += array[ai][1];
	}

	*ap->a_doffsetp = pbeg;
	if (ap->a_runp)
		*ap->a_runp = pbytes;
	return (0);
}
1433
/*
 * hammer2_vop_open { vp, mode, cred, fp }
 *
 * No hammer2-specific open processing; defer to the stock handler.
 */
static
int
hammer2_vop_open(struct vop_open_args *ap)
{
	return (vop_stdopen(ap));
}
1440
1441 /*
1442  * hammer2_vop_advlock { vp, id, op, fl, flags }
1443  */
1444 static
1445 int
1446 hammer2_vop_advlock(struct vop_advlock_args *ap)
1447 {
1448         hammer2_inode_t *ip = VTOI(ap->a_vp);
1449
1450         return (lf_advlock(ap, &ip->advlock, ip->ip_data.size));
1451 }
1452
1453
/*
 * hammer2_vop_close { vp, fflag }
 *
 * No hammer2-specific close processing; defer to the stock handler.
 */
static
int
hammer2_vop_close(struct vop_close_args *ap)
{
	return (vop_stdclose(ap));
}
1460
1461 /*
1462  * hammer2_vop_nlink { nch, dvp, vp, cred }
1463  *
1464  * Create a hardlink to vp.
1465  */
1466 static
1467 int
1468 hammer2_vop_nlink(struct vop_nlink_args *ap)
1469 {
1470         hammer2_inode_t *dip;
1471         hammer2_inode_t *ip;    /* inode we are hardlinking to */
1472         hammer2_mount_t *hmp;
1473         struct namecache *ncp;
1474         const uint8_t *name;
1475         size_t name_len;
1476         int error;
1477
1478         dip = VTOI(ap->a_dvp);
1479         hmp = dip->hmp;
1480         if (hmp->ronly)
1481                 return (EROFS);
1482
1483         ip = VTOI(ap->a_vp);
1484
1485         ncp = ap->a_nch->ncp;
1486         name = ncp->nc_name;
1487         name_len = ncp->nc_nlen;
1488
1489         error = hammer2_hardlink_create(ip, dip, name, name_len);
1490         if (error == 0) {
1491                 cache_setunresolved(ap->a_nch);
1492                 cache_setvp(ap->a_nch, ap->a_vp);
1493         }
1494         return error;
1495 }
1496
1497 /*
1498  * hammer2_vop_ncreate { nch, dvp, vpp, cred, vap }
1499  *
1500  * The operating system has already ensured that the directory entry
1501  * does not exist and done all appropriate namespace locking.
1502  */
1503 static
1504 int
1505 hammer2_vop_ncreate(struct vop_ncreate_args *ap)
1506 {
1507         hammer2_mount_t *hmp;
1508         hammer2_inode_t *dip;
1509         hammer2_inode_t *nip;
1510         struct namecache *ncp;
1511         const uint8_t *name;
1512         size_t name_len;
1513         int error;
1514
1515         dip = VTOI(ap->a_dvp);
1516         hmp = dip->hmp;
1517         if (hmp->ronly)
1518                 return (EROFS);
1519
1520         ncp = ap->a_nch->ncp;
1521         name = ncp->nc_name;
1522         name_len = ncp->nc_nlen;
1523
1524         error = hammer2_inode_create(dip, ap->a_vap, ap->a_cred,
1525                                      name, name_len, &nip);
1526         if (error) {
1527                 KKASSERT(nip == NULL);
1528                 *ap->a_vpp = NULL;
1529                 return error;
1530         }
1531         *ap->a_vpp = hammer2_igetv(nip, &error);
1532         hammer2_chain_unlock(hmp, &nip->chain);
1533
1534         if (error == 0) {
1535                 cache_setunresolved(ap->a_nch);
1536                 cache_setvp(ap->a_nch, *ap->a_vpp);
1537         }
1538         return error;
1539 }
1540
/*
 * hammer2_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 *
 * Create a symbolic link.  Short targets are stored in the inode's
 * embedded data area; longer ones are written out like file data.
 */
static
int
hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	hammer2_mount_t *hmp;
	hammer2_inode_t *dip;
	hammer2_inode_t *nip;
	struct namecache *ncp;
	const uint8_t *name;
	size_t name_len;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;
	if (hmp->ronly)
		return (EROFS);

	ncp = ap->a_nch->ncp;
	name = ncp->nc_name;
	name_len = ncp->nc_nlen;

	ap->a_vap->va_type = VLNK;	/* enforce type */

	error = hammer2_inode_create(dip, ap->a_vap, ap->a_cred,
				     name, name_len, &nip);
	if (error) {
		KKASSERT(nip == NULL);
		*ap->a_vpp = NULL;
		return error;
	}
	*ap->a_vpp = hammer2_igetv(nip, &error);

	/*
	 * Build the softlink (~like file data) and finalize the namecache.
	 */
	if (error == 0) {
		size_t bytes;
		struct uio auio;
		struct iovec aiov;

		bytes = strlen(ap->a_target);

		if (bytes <= HAMMER2_EMBEDDED_BYTES) {
			/*
			 * Short target: copy directly into the inode's
			 * embedded (direct-data) area.
			 */
			KKASSERT(nip->ip_data.op_flags &
				 HAMMER2_OPFLAG_DIRECTDATA);
			bcopy(ap->a_target, nip->ip_data.u.data, bytes);
			nip->ip_data.size = bytes;
		} else {
			/*
			 * Long target: write it through the regular file
			 * write path using a kernel-space uio.
			 */
			bzero(&auio, sizeof(auio));
			bzero(&aiov, sizeof(aiov));
			auio.uio_iov = &aiov;
			auio.uio_segflg = UIO_SYSSPACE;
			auio.uio_rw = UIO_WRITE;
			auio.uio_resid = bytes;
			auio.uio_iovcnt = 1;
			auio.uio_td = curthread;
			aiov.iov_base = ap->a_target;
			aiov.iov_len = bytes;
			error = hammer2_write_file(nip, &auio, IO_APPEND, 0);
			/* XXX handle error */
			/*
			 * NOTE(review): write errors are deliberately
			 * discarded here (marked XXX by the author); the
			 * symlink is created even if the body write fails.
			 */
			error = 0;
		}
	}
	hammer2_chain_unlock(hmp, &nip->chain);

	/*
	 * Finalize namecache
	 */
	if (error == 0) {
		cache_setunresolved(ap->a_nch);
		cache_setvp(ap->a_nch, *ap->a_vpp);
		/* hammer2_knote(ap->a_dvp, NOTE_WRITE); */
	}
	return error;
}
1619
1620 /*
1621  * hammer2_vop_nremove { nch, dvp, cred }
1622  */
1623 static
1624 int
1625 hammer2_vop_nremove(struct vop_nremove_args *ap)
1626 {
1627         hammer2_inode_t *dip;
1628         hammer2_mount_t *hmp;
1629         struct namecache *ncp;
1630         const uint8_t *name;
1631         size_t name_len;
1632         int error;
1633
1634         dip = VTOI(ap->a_dvp);
1635         hmp = dip->hmp;
1636         if (hmp->ronly)
1637                 return(EROFS);
1638
1639         ncp = ap->a_nch->ncp;
1640         name = ncp->nc_name;
1641         name_len = ncp->nc_nlen;
1642
1643         error = hammer2_unlink_file(dip, name, name_len, 0, 1);
1644
1645         if (error == 0) {
1646                 cache_setunresolved(ap->a_nch);
1647                 cache_setvp(ap->a_nch, NULL);
1648         }
1649         return (error);
1650 }
1651
1652 /*
1653  * hammer2_vop_nrmdir { nch, dvp, cred }
1654  */
1655 static
1656 int
1657 hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
1658 {
1659         hammer2_inode_t *dip;
1660         hammer2_mount_t *hmp;
1661         struct namecache *ncp;
1662         const uint8_t *name;
1663         size_t name_len;
1664         int error;
1665
1666         dip = VTOI(ap->a_dvp);
1667         hmp = dip->hmp;
1668         if (hmp->ronly)
1669                 return(EROFS);
1670
1671         ncp = ap->a_nch->ncp;
1672         name = ncp->nc_name;
1673         name_len = ncp->nc_nlen;
1674
1675         error = hammer2_unlink_file(dip, name, name_len, 1, 1);
1676
1677         if (error == 0) {
1678                 cache_setunresolved(ap->a_nch);
1679                 cache_setvp(ap->a_nch, NULL);
1680         }
1681         return (error);
1682 }
1683
/*
 * hammer2_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 *
 * Rename fnch in fdvp to tnch in tdvp: remove any existing target,
 * disconnect the source inode from its directory, and reconnect it
 * under the target name.
 */
static
int
hammer2_vop_nrename(struct vop_nrename_args *ap)
{
	struct namecache *fncp;
	struct namecache *tncp;
	hammer2_inode_t *fdip;
	hammer2_inode_t *tdip;
	hammer2_inode_t *ip;
	hammer2_mount_t *hmp;
	const uint8_t *fname;
	size_t fname_len;
	const uint8_t *tname;
	size_t tname_len;
	int error;

	/* Cross-mount renames are not supported */
	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);	/* source directory */
	tdip = VTOI(ap->a_tdvp);	/* target directory */

	hmp = fdip->hmp;		/* check read-only filesystem */
	if (hmp->ronly)
		return(EROFS);

	fncp = ap->a_fnch->ncp;		/* entry name in source */
	fname = fncp->nc_name;
	fname_len = fncp->nc_nlen;

	tncp = ap->a_tnch->ncp;		/* entry name in target */
	tname = tncp->nc_name;
	tname_len = tncp->nc_nlen;

	ip = VTOI(fncp->nc_vp);		/* inode being moved */

	/*
	 * Keep a tight grip on the inode as removing it should disconnect
	 * it and we don't want to destroy it.
	 *
	 * NOTE: To avoid deadlocks we cannot lock (ip) while we are
	 *	 unlinking elements from their directories.
	 */
	hammer2_chain_ref(hmp, &ip->chain);

	/*
	 * Remove target if it exists (ENOENT is not an error here)
	 */
	error = hammer2_unlink_file(tdip, tname, tname_len, -1, 1);
	if (error && error != ENOENT)
		goto done;
	cache_setunresolved(ap->a_tnch);
	cache_setvp(ap->a_tnch, NULL);

	/*
	 * Disconnect ip from the source directory, do not adjust
	 * the link count.  Note that rename doesn't need to understand
	 * whether this is a hardlink or not, we can just rename the
	 * forwarding entry and don't even have to adjust the related
	 * hardlink's link count.
	 */
	error = hammer2_unlink_file(fdip, fname, fname_len, -1, 0);
	if (error)
		goto done;

	if (ip->chain.parent != NULL)
		panic("hammer2_vop_nrename(): rename source != ip!");

	/*
	 * Reconnect ip to target directory.
	 *
	 * WARNING: chain locks can lock buffer cache buffers, to avoid
	 *	    deadlocks we want to unlock before issuing a cache_*()
	 *	    op (that might have to lock a vnode).
	 */
	hammer2_chain_lock(hmp, &ip->chain, HAMMER2_RESOLVE_ALWAYS);
	error = hammer2_inode_connect(tdip, ip, tname, tname_len);
	hammer2_chain_unlock(hmp, &ip->chain);

	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
	}
done:
	hammer2_chain_drop(hmp, &ip->chain);	/* from ref up top */

	return (error);
}
1776
1777 static int hammer2_strategy_read(struct vop_strategy_args *ap);
1778 static int hammer2_strategy_write(struct vop_strategy_args *ap);
1779
1780 static
1781 int
1782 hammer2_vop_strategy(struct vop_strategy_args *ap)
1783 {
1784         struct bio *biop;
1785         struct buf *bp;
1786         int error;
1787
1788         biop = ap->a_bio;
1789         bp = biop->bio_buf;
1790
1791         switch(bp->b_cmd) {
1792         case BUF_CMD_READ:
1793                 error = hammer2_strategy_read(ap);
1794                 ++hammer2_iod_file_read;
1795                 break;
1796         case BUF_CMD_WRITE:
1797                 error = hammer2_strategy_write(ap);
1798                 ++hammer2_iod_file_write;
1799                 break;
1800         default:
1801                 bp->b_error = error = EINVAL;
1802                 bp->b_flags |= B_ERROR;
1803                 biodone(biop);
1804                 break;
1805         }
1806
1807         return (error);
1808 }
1809
/*
 * Read strategy.  Resolve the logical file offset to a device offset
 * (cached in the translated bio) if necessary, then satisfy the read
 * from zero-fill, from data embedded in the inode, or by forwarding
 * the bio directly to the device.
 */
static
int
hammer2_strategy_read(struct vop_strategy_args *ap)
{
        struct buf *bp;
        struct bio *bio;
        struct bio *nbio;               /* translated (device-level) bio */
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        hammer2_key_t lbase;            /* logical file offset */

        bio = ap->a_bio;
        bp = bio->bio_buf;
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        nbio = push_bio(bio);

        lbase = bio->bio_offset;
        chain = NULL;
        /* reads must be aligned to the physical buffer size */
        KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);

        /*
         * We must characterize the logical->physical translation if it
         * has not already been cached.
         *
         * Physical data references < LBUFSIZE are never cached.  This
         * includes both small-block allocations and inode-embedded data.
         */
        if (nbio->bio_offset == NOOFFSET) {
                parent = &ip->chain;
                hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);

                chain = hammer2_chain_lookup(hmp, &parent, lbase, lbase,
                                             HAMMER2_LOOKUP_NODATA);
                if (chain == NULL) {
                        /*
                         * Data is zero-fill
                         */
                        nbio->bio_offset = ZFOFFSET;
                } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
                        /*
                         * Data is embedded in the inode (do nothing,
                         * the NOOFFSET translation is left in place and
                         * handled by the embedded-copy case below).
                         */
                        KKASSERT(chain == parent);
                        hammer2_chain_unlock(hmp, chain);
                } else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
                        /*
                         * Data is on-media; cache the device offset.
                         */
                        KKASSERT(bp->b_bcount == chain->bytes);
                        nbio->bio_offset = chain->bref.data_off &
                                           HAMMER2_OFF_MASK;
                        hammer2_chain_unlock(hmp, chain);
                        KKASSERT(nbio->bio_offset != 0);
                } else {
                        panic("hammer2_strategy_read: unknown bref type");
                }
                hammer2_chain_unlock(hmp, parent);
        }

        if (hammer2_debug & 0x0020) {
                kprintf("read %016jx %016jx\n",
                        bio->bio_offset, nbio->bio_offset);
        }

        if (nbio->bio_offset == ZFOFFSET) {
                /*
                 * Data is zero-fill
                 */
                bp->b_resid = 0;
                bp->b_error = 0;
                bzero(bp->b_data, bp->b_bcount);
                biodone(nbio);
        } else if (nbio->bio_offset != NOOFFSET) {
                /*
                 * Forward direct IO to the device
                 */
                vn_strategy(hmp->devvp, nbio);
        } else {
                /*
                 * Data is embedded in inode: copy it out and zero-fill
                 * the remainder of the buffer.
                 *
                 * NOTE(review): chain->data is dereferenced here after
                 * the chain locks were released above -- presumably the
                 * embedded inode data stays valid via the vnode's hold
                 * on ip; confirm against the chain locking rules.
                 */
                bcopy(chain->data->ipdata.u.data, bp->b_data,
                      HAMMER2_EMBEDDED_BYTES);
                bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
                      bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
                bp->b_resid = 0;
                bp->b_error = 0;
                biodone(nbio);
        }
        return (0);
}
1904
/*
 * Write strategy.  By the time the strategy code is reached the
 * logical->physical translation must already be resolved (asserted
 * below): the data is either embedded in the inode (NOOFFSET) or
 * forwarded directly to the device at the cached offset.
 */
static
int
hammer2_strategy_write(struct vop_strategy_args *ap)
{
        struct buf *bp;
        struct bio *bio;
        struct bio *nbio;               /* translated (device-level) bio */
        hammer2_mount_t *hmp;
        hammer2_inode_t *ip;

        bio = ap->a_bio;
        bp = bio->bio_buf;
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        nbio = push_bio(bio);

        /* writes must be aligned and may not target zero-fill */
        KKASSERT((bio->bio_offset & HAMMER2_PBUFMASK64) == 0);
        KKASSERT(nbio->bio_offset != 0 && nbio->bio_offset != ZFOFFSET);

        if (nbio->bio_offset == NOOFFSET) {
                /*
                 * Must be embedded in the inode.
                 */
                KKASSERT(bio->bio_offset == 0);
                bcopy(bp->b_data, ip->ip_data.u.data, HAMMER2_EMBEDDED_BYTES);
                bp->b_resid = 0;
                bp->b_error = 0;
                biodone(nbio);

                /*
                 * This special flag does not follow the normal MODIFY rules
                 * because we might deadlock on ip.  Instead we depend on
                 * VOP_FSYNC() to detect the case.
                 */
                atomic_set_int(&ip->chain.flags, HAMMER2_CHAIN_DIRTYEMBED);
        } else {
                /*
                 * Forward direct IO to the device
                 */
                vn_strategy(hmp->devvp, nbio);
        }
        return (0);
}
1948
1949 /*
1950  * hammer2_vop_ioctl { vp, command, data, fflag, cred }
1951  */
1952 static
1953 int
1954 hammer2_vop_ioctl(struct vop_ioctl_args *ap)
1955 {
1956         hammer2_mount_t *hmp;
1957         hammer2_inode_t *ip;
1958         int error;
1959
1960         ip = VTOI(ap->a_vp);
1961         hmp = ip->hmp;
1962
1963         error = hammer2_ioctl(ip, ap->a_command, (void *)ap->a_data,
1964                               ap->a_fflag, ap->a_cred);
1965         return (error);
1966 }
1967
1968 static
1969 int 
1970 hammer2_vop_mountctl(struct vop_mountctl_args *ap)
1971 {
1972         struct mount *mp;
1973         hammer2_pfsmount_t *pmp;
1974         int rc;
1975
1976         switch (ap->a_op) {
1977         case (MOUNTCTL_SET_EXPORT):
1978                 mp = ap->a_head.a_ops->head.vv_mount;
1979                 pmp = MPTOPMP(mp);
1980
1981                 if (ap->a_ctllen != sizeof(struct export_args))
1982                         rc = (EINVAL);
1983                 else
1984                         rc = vfs_export(mp, &pmp->export,
1985                                         (const struct export_args *)ap->a_ctl);
1986                 break;
1987         default:
1988                 rc = vop_stdmountctl(ap);
1989                 break;
1990         }
1991         return (rc);
1992 }
1993
1994 struct vop_ops hammer2_vnode_vops = {
1995         .vop_default    = vop_defaultop,
1996         .vop_fsync      = hammer2_vop_fsync,
1997         .vop_getpages   = vop_stdgetpages,
1998         .vop_putpages   = vop_stdputpages,
1999         .vop_access     = hammer2_vop_access,
2000         .vop_advlock    = hammer2_vop_advlock,
2001         .vop_close      = hammer2_vop_close,
2002         .vop_nlink      = hammer2_vop_nlink,
2003         .vop_ncreate    = hammer2_vop_ncreate,
2004         .vop_nsymlink   = hammer2_vop_nsymlink,
2005         .vop_nremove    = hammer2_vop_nremove,
2006         .vop_nrmdir     = hammer2_vop_nrmdir,
2007         .vop_nrename    = hammer2_vop_nrename,
2008         .vop_getattr    = hammer2_vop_getattr,
2009         .vop_setattr    = hammer2_vop_setattr,
2010         .vop_readdir    = hammer2_vop_readdir,
2011         .vop_readlink   = hammer2_vop_readlink,
2012         .vop_getpages   = vop_stdgetpages,
2013         .vop_putpages   = vop_stdputpages,
2014         .vop_read       = hammer2_vop_read,
2015         .vop_write      = hammer2_vop_write,
2016         .vop_open       = hammer2_vop_open,
2017         .vop_inactive   = hammer2_vop_inactive,
2018         .vop_reclaim    = hammer2_vop_reclaim,
2019         .vop_nresolve   = hammer2_vop_nresolve,
2020         .vop_nlookupdotdot = hammer2_vop_nlookupdotdot,
2021         .vop_nmkdir     = hammer2_vop_nmkdir,
2022         .vop_ioctl      = hammer2_vop_ioctl,
2023         .vop_mountctl   = hammer2_vop_mountctl,
2024         .vop_bmap       = hammer2_vop_bmap,
2025         .vop_strategy   = hammer2_vop_strategy,
2026 };
2027
/*
 * Vnode operations vector for special-device vnodes.  Currently empty
 * (no operations wired up); all members default to zero.
 */
struct vop_ops hammer2_spec_vops = {

};
2031
/*
 * Vnode operations vector for fifo vnodes.  Currently empty (no
 * operations wired up); all members default to zero.
 */
struct vop_ops hammer2_fifo_vops = {

};