hammer2 - Major restructuring, part 4/several
[dragonfly.git] / sys / vfs / hammer2 / hammer2_vnops.c
1 /*
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * Kernel Filesystem interface
37  *
38  * NOTE! local ipdata pointers must be reloaded on any modifying operation
39  *       to the inode as its underlying chain may have changed.
40  */
41
42 #include <sys/param.h>
43 #include <sys/systm.h>
44 #include <sys/kernel.h>
45 #include <sys/fcntl.h>
46 #include <sys/buf.h>
47 #include <sys/proc.h>
48 #include <sys/namei.h>
49 #include <sys/mount.h>
50 #include <sys/vnode.h>
51 #include <sys/mountctl.h>
52 #include <sys/dirent.h>
53 #include <sys/uio.h>
54
55 #include "hammer2.h"
56
57 #define ZFOFFSET        (-2LL)
58
59 static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
60                                 int seqcount);
61 static int hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
62                                 struct uio *uio, int ioflag, int seqcount);
63 static hammer2_off_t hammer2_assign_physical(hammer2_trans_t *trans,
64                                 hammer2_inode_t *ip,
65                                 hammer2_key_t lbase, int lblksize,
66                                 int *errorp);
67 static void hammer2_extend_file(hammer2_trans_t *trans,
68                                 hammer2_inode_t *ip, hammer2_key_t nsize);
69 static void hammer2_truncate_file(hammer2_trans_t *trans,
70                                 hammer2_inode_t *ip, hammer2_key_t nsize);
71
72 static __inline
73 void
74 hammer2_knote(struct vnode *vp, int flags)
75 {
76         if (flags)
77                 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
78 }
79
80 /*
81  * Last reference to a vnode is going away but it is still cached.
82  */
83 static
84 int
85 hammer2_vop_inactive(struct vop_inactive_args *ap)
86 {
87         hammer2_inode_t *ip;
88         hammer2_trans_t trans;
89         struct vnode *vp;
90 #if 0
91         struct hammer2_mount *hmp;
92 #endif
93
94         vp = ap->a_vp;
95         ip = VTOI(vp);
96
97         /*
98          * Degenerate case
99          */
100         if (ip == NULL) {
101                 vrecycle(vp);
102                 return (0);
103         }
104
105         /*
106          * Detect updates to the embedded data which may be synchronized by
107          * the strategy code.  Simply mark the inode modified so it gets
108          * picked up by our normal flush.
109          */
110         hammer2_inode_lock_ex(ip);
111         KKASSERT(ip->chain);
112         if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
113                 atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
114                 hammer2_trans_init(&trans, ip->hmp);
115                 hammer2_chain_modify(&trans, ip->chain, 0);
116                 hammer2_trans_done(&trans);
117         }
118
119         /*
120          * Check for deleted inodes and recycle immediately.
121          */
122         if (ip->chain->flags & HAMMER2_CHAIN_DELETED) {
123                 hammer2_inode_unlock_ex(ip);
124                 vrecycle(vp);
125         } else {
126                 hammer2_inode_unlock_ex(ip);
127         }
128         return (0);
129 }
130
131 /*
132  * Reclaim a vnode so that it can be reused; after the inode is
133  * disassociated, the filesystem must manage it alone.
134  */
135 static
136 int
137 hammer2_vop_reclaim(struct vop_reclaim_args *ap)
138 {
139         hammer2_chain_t *chain;
140         hammer2_inode_t *ip;
141         hammer2_mount_t *hmp;
142         hammer2_trans_t trans;
143         struct vnode *vp;
144
145         vp = ap->a_vp;
146         ip = VTOI(vp);
147         if (ip == NULL)
148                 return(0);
149         hmp = ip->hmp;
150
151         /*
152          * Set SUBMODIFIED so we can detect and propagate the DESTROYED
153          * bit in the flush code.
154          *
155          * ip->chain might be stale, correct it before checking as older
156          * versions of the chain are likely marked deleted even if the
157          * file hasn't been.  XXX ip->chain should never be stale on
158          * reclaim.
159          */
160         hammer2_inode_lock_ex(ip);
161         chain = ip->chain;
162         if (chain->duplink)
163                 kprintf("RECLAIM DUPLINKED IP: %p %p\n", ip, ip->chain);
164 #if 0
165         while (chain->duplink)
166                 chain = chain->duplink;
167         if (ip->chain != chain) {
168                 hammer2_inode_repoint(ip, ip->pip, chain);
169                 chain = ip->chain;
170         }
171 #endif
172
173         /*
174          * The final close of a deleted file or directory marks it for
175          * destruction.  The DESTROYED flag allows the flusher to shortcut
176          * any modified blocks still unflushed (that is, just ignore them).
177          *
178          * HAMMER2 usually does not try to optimize the freemap by returning
179          * deleted blocks to it as it does not usually know how many snapshots
180          * might be referencing portions of the file/dir.  XXX TODO.
181          *
182          * XXX TODO - However, any modified file as-of when a snapshot is made
183          *            cannot use this optimization as some of the modifications
184          *            may wind up being part of the snapshot.
185          */
186         vp->v_data = NULL;
187         ip->vp = NULL;
188         if (chain->flags & HAMMER2_CHAIN_DELETED) {
189                 KKASSERT(chain->flags & HAMMER2_CHAIN_DELETED);
190                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DESTROYED |
191                                               HAMMER2_CHAIN_SUBMODIFIED);
192         }
193         if (chain->flags & (HAMMER2_CHAIN_MODIFIED |
194                             HAMMER2_CHAIN_DELETED |
195                             HAMMER2_CHAIN_SUBMODIFIED)) {
196                 hammer2_trans_init(&trans, ip->hmp);
197                 hammer2_chain_flush(&trans, chain);
198                 hammer2_trans_done(&trans);
199         }
200         if (ip->refs > 2)                           /* (our lock + vp ref) */
201                 hammer2_inode_unlock_ex(ip);        /* unlock */
202         else
203                 hammer2_inode_put(ip);              /* unlock & disconnect */
204         /* chain no longer referenced */
205         /* chain = NULL; not needed */
206         hammer2_inode_drop(ip);                     /* vp ref */
207
208         /*
209          * XXX handle background sync when ip dirty, kernel will no longer
210          * notify us regarding this inode because there is no longer a
211          * vnode attached to it.
212          */
213
214         return (0);
215 }
216
217 static
218 int
219 hammer2_vop_fsync(struct vop_fsync_args *ap)
220 {
221         hammer2_inode_t *ip;
222         hammer2_trans_t trans;
223         struct vnode *vp;
224
225         vp = ap->a_vp;
226         ip = VTOI(vp);
227
228         hammer2_trans_init(&trans, ip->hmp);
229         hammer2_inode_lock_ex(ip);
230
231         vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
232
233         /*
234          * Detect updates to the embedded data which may be synchronized by
235          * the strategy code.  Simply mark the inode modified so it gets
236          * picked up by our normal flush.
237          */
238         if (ip->flags & HAMMER2_INODE_DIRTYEMBED) {
239                 atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
240                 hammer2_chain_modify(&trans, ip->chain, 0);
241         }
242
243         /*
244          * Calling chain_flush here creates a lot of duplicative
245          * COW operations due to non-optimal vnode ordering.
246          *
247          * Only do it for an actual fsync() syscall.  The other forms
248          * which call this function will eventually call chain_flush
249          * on the volume root as a catch-all, which is far more optimal.
250          */
251         atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
252         if (ap->a_flags & VOP_FSYNC_SYSCALL)
253                 hammer2_chain_flush(&trans, ip->chain);
254         hammer2_inode_unlock_ex(ip);
255         hammer2_trans_done(&trans);
256         return (0);
257 }
258
259 static
260 int
261 hammer2_vop_access(struct vop_access_args *ap)
262 {
263         hammer2_inode_t *ip = VTOI(ap->a_vp);
264         hammer2_inode_data_t *ipdata;
265         uid_t uid;
266         gid_t gid;
267         int error;
268
269         hammer2_inode_lock_sh(ip);
270         ipdata = &ip->chain->data->ipdata;
271         uid = hammer2_to_unix_xid(&ipdata->uid);
272         gid = hammer2_to_unix_xid(&ipdata->gid);
273         error = vop_helper_access(ap, uid, gid, ipdata->mode, ipdata->uflags);
274         hammer2_inode_unlock_sh(ip);
275
276         return (error);
277 }
278
279 static
280 int
281 hammer2_vop_getattr(struct vop_getattr_args *ap)
282 {
283         hammer2_inode_data_t *ipdata;
284         hammer2_pfsmount_t *pmp;
285         hammer2_inode_t *ip;
286         struct vnode *vp;
287         struct vattr *vap;
288
289         vp = ap->a_vp;
290         vap = ap->a_vap;
291
292         ip = VTOI(vp);
293         pmp = ip->pmp;
294
295         hammer2_inode_lock_sh(ip);
296         ipdata = &ip->chain->data->ipdata;
297
298         vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
299         vap->va_fileid = ipdata->inum;
300         vap->va_mode = ipdata->mode;
301         vap->va_nlink = ipdata->nlinks;
302         vap->va_uid = hammer2_to_unix_xid(&ipdata->uid);
303         vap->va_gid = hammer2_to_unix_xid(&ipdata->gid);
304         vap->va_rmajor = 0;
305         vap->va_rminor = 0;
306         vap->va_size = ipdata->size;
307         vap->va_blocksize = HAMMER2_PBUFSIZE;
308         vap->va_flags = ipdata->uflags;
309         hammer2_time_to_timespec(ipdata->ctime, &vap->va_ctime);
310         hammer2_time_to_timespec(ipdata->mtime, &vap->va_mtime);
311         hammer2_time_to_timespec(ipdata->mtime, &vap->va_atime);
312         vap->va_gen = 1;
313         vap->va_bytes = vap->va_size;   /* XXX */
314         vap->va_type = hammer2_get_vtype(ip->chain);
315         vap->va_filerev = 0;
316         vap->va_uid_uuid = ipdata->uid;
317         vap->va_gid_uuid = ipdata->gid;
318         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
319                           VA_FSID_UUID_VALID;
320
321         hammer2_inode_unlock_sh(ip);
322
323         return (0);
324 }
325
326 static
327 int
328 hammer2_vop_setattr(struct vop_setattr_args *ap)
329 {
330         hammer2_inode_data_t *ipdata;
331         hammer2_inode_t *ip;
332         hammer2_mount_t *hmp;
333         hammer2_trans_t trans;
334         struct vnode *vp;
335         struct vattr *vap;
336         int error;
337         int kflags = 0;
338         int domtime = 0;
339         uint64_t ctime;
340
341         vp = ap->a_vp;
342         vap = ap->a_vap;
343         hammer2_update_time(&ctime);
344
345         ip = VTOI(vp);
346         hmp = ip->hmp;
347
348         if (hmp->ronly)
349                 return(EROFS);
350
351         hammer2_trans_init(&trans, hmp);
352         hammer2_inode_lock_ex(ip);
353         ipdata = &ip->chain->data->ipdata;
354         error = 0;
355
356         if (vap->va_flags != VNOVAL) {
357                 u_int32_t flags;
358
359                 flags = ipdata->uflags;
360                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
361                                          hammer2_to_unix_xid(&ipdata->uid),
362                                          ap->a_cred);
363                 if (error == 0) {
364                         if (ipdata->uflags != flags) {
365                                 hammer2_chain_modify(&trans, ip->chain, 0);
366                                 ipdata = &ip->chain->data->ipdata; /* RELOAD */
367                                 ipdata->uflags = flags;
368                                 ipdata->ctime = ctime;
369                                 kflags |= NOTE_ATTRIB;
370                         }
371                         if (ipdata->uflags & (IMMUTABLE | APPEND)) {
372                                 error = 0;
373                                 goto done;
374                         }
375                 }
376                 goto done;
377         }
378         if (ipdata->uflags & (IMMUTABLE | APPEND)) {
379                 error = EPERM;
380                 goto done;
381         }
382         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
383                 mode_t cur_mode = ipdata->mode;
384                 uid_t cur_uid = hammer2_to_unix_xid(&ipdata->uid);
385                 gid_t cur_gid = hammer2_to_unix_xid(&ipdata->gid);
386                 uuid_t uuid_uid;
387                 uuid_t uuid_gid;
388
389                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
390                                          ap->a_cred,
391                                          &cur_uid, &cur_gid, &cur_mode);
392                 if (error == 0) {
393                         hammer2_guid_to_uuid(&uuid_uid, cur_uid);
394                         hammer2_guid_to_uuid(&uuid_gid, cur_gid);
395                         if (bcmp(&uuid_uid, &ipdata->uid, sizeof(uuid_uid)) ||
396                             bcmp(&uuid_gid, &ipdata->gid, sizeof(uuid_gid)) ||
397                             ipdata->mode != cur_mode
398                         ) {
399                                 hammer2_chain_modify(&trans, ip->chain, 0);
400                                 ipdata = &ip->chain->data->ipdata; /* RELOAD */
401                                 ipdata->uid = uuid_uid;
402                                 ipdata->gid = uuid_gid;
403                                 ipdata->mode = cur_mode;
404                                 ipdata->ctime = ctime;
405                         }
406                         kflags |= NOTE_ATTRIB;
407                 }
408         }
409
410         /*
411          * Resize the file
412          */
413         if (vap->va_size != VNOVAL && ipdata->size != vap->va_size) {
414                 switch(vp->v_type) {
415                 case VREG:
416                         if (vap->va_size == ipdata->size)
417                                 break;
418                         if (vap->va_size < ipdata->size) {
419                                 hammer2_truncate_file(&trans, ip, vap->va_size);
420                         } else {
421                                 hammer2_extend_file(&trans, ip, vap->va_size);
422                         }
423                         ipdata = &ip->chain->data->ipdata; /* RELOAD */
424                         domtime = 1;
425                         break;
426                 default:
427                         error = EINVAL;
428                         goto done;
429                 }
430         }
431 #if 0
432         /* atime not supported */
433         if (vap->va_atime.tv_sec != VNOVAL) {
434                 hammer2_chain_modify(&trans, ip->chain, 0);
435                 ipdata = &ip->chain->data->ipdata; /* RELOAD */
436                 ipdata->atime = hammer2_timespec_to_time(&vap->va_atime);
437                 kflags |= NOTE_ATTRIB;
438         }
439 #endif
440         if (vap->va_mtime.tv_sec != VNOVAL) {
441                 hammer2_chain_modify(&trans, ip->chain, 0);
442                 ipdata = &ip->chain->data->ipdata; /* RELOAD */
443                 ipdata->mtime = hammer2_timespec_to_time(&vap->va_mtime);
444                 kflags |= NOTE_ATTRIB;
445         }
446         if (vap->va_mode != (mode_t)VNOVAL) {
447                 mode_t cur_mode = ipdata->mode;
448                 uid_t cur_uid = hammer2_to_unix_xid(&ipdata->uid);
449                 gid_t cur_gid = hammer2_to_unix_xid(&ipdata->gid);
450
451                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
452                                          cur_uid, cur_gid, &cur_mode);
453                 if (error == 0 && ipdata->mode != cur_mode) {
454                         hammer2_chain_modify(&trans, ip->chain, 0);
455                         ipdata = &ip->chain->data->ipdata; /* RELOAD */
456                         ipdata->mode = cur_mode;
457                         ipdata->ctime = ctime;
458                         kflags |= NOTE_ATTRIB;
459                 }
460         }
461 done:
462         hammer2_inode_unlock_ex(ip);
463         hammer2_trans_done(&trans);
464         return (error);
465 }
466
467 static
468 int
469 hammer2_vop_readdir(struct vop_readdir_args *ap)
470 {
471         hammer2_inode_data_t *ipdata;
472         hammer2_mount_t *hmp;
473         hammer2_inode_t *ip;
474         hammer2_inode_t *xip;
475         hammer2_chain_t *parent;
476         hammer2_chain_t *chain;
477         hammer2_tid_t inum;
478         hammer2_key_t lkey;
479         struct uio *uio;
480         off_t *cookies;
481         off_t saveoff;
482         int cookie_index;
483         int ncookies;
484         int error;
485         int dtype;
486         int r;
487
488         ip = VTOI(ap->a_vp);
489         hmp = ip->hmp;
490         uio = ap->a_uio;
491         saveoff = uio->uio_offset;
492
493         /*
494          * Setup cookies directory entry cookies if requested
495          */
496         if (ap->a_ncookies) {
497                 ncookies = uio->uio_resid / 16 + 1;
498                 if (ncookies > 1024)
499                         ncookies = 1024;
500                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
501         } else {
502                 ncookies = -1;
503                 cookies = NULL;
504         }
505         cookie_index = 0;
506
507         hammer2_inode_lock_sh(ip);
508         ipdata = &ip->chain->data->ipdata;
509
510         /*
511          * Handle artificial entries.  To ensure that only positive 64 bit
512          * quantities are returned to userland we always strip off bit 63.
513          * The hash code is designed such that codes 0x0000-0x7FFF are not
514          * used, allowing us to use these codes for articial entries.
515          *
516          * Entry 0 is used for '.' and entry 1 is used for '..'.  Do not
517          * allow '..' to cross the mount point into (e.g.) the super-root.
518          */
519         error = 0;
520         chain = (void *)(intptr_t)-1;   /* non-NULL for early goto done case */
521
522         if (saveoff == 0) {
523                 inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
524                 r = vop_write_dirent(&error, uio, inum, DT_DIR, 1, ".");
525                 if (r)
526                         goto done;
527                 if (cookies)
528                         cookies[cookie_index] = saveoff;
529                 ++saveoff;
530                 ++cookie_index;
531                 if (cookie_index == ncookies)
532                         goto done;
533         }
534
535         if (saveoff == 1) {
536                 /*
537                  * Be careful with lockorder when accessing ".."
538                  *
539                  * (ip is the current dir. xip is the parent dir).
540                  */
541                 inum = ipdata->inum & HAMMER2_DIRHASH_USERMSK;
542                 while (ip->pip != NULL && ip != ip->pmp->iroot) {
543                         xip = ip->pip;
544                         hammer2_inode_ref(xip);
545                         hammer2_inode_unlock_sh(ip);
546                         hammer2_inode_lock_sh(xip);
547                         hammer2_inode_lock_sh(ip);
548                         hammer2_inode_drop(xip);
549                         if (xip == ip->pip) {
550                                 inum = xip->chain->data->ipdata.inum &
551                                        HAMMER2_DIRHASH_USERMSK;
552                                 hammer2_inode_unlock_sh(xip);
553                                 break;
554                         }
555                         hammer2_inode_unlock_sh(xip);
556                 }
557                 r = vop_write_dirent(&error, uio, inum, DT_DIR, 2, "..");
558                 if (r)
559                         goto done;
560                 if (cookies)
561                         cookies[cookie_index] = saveoff;
562                 ++saveoff;
563                 ++cookie_index;
564                 if (cookie_index == ncookies)
565                         goto done;
566         }
567
568         lkey = saveoff | HAMMER2_DIRHASH_VISIBLE;
569
570         /*
571          * parent is the inode chain, already locked for us.  Don't
572          * double lock shared locks as this will screw up upgrades.
573          */
574         if (error) {
575                 goto done;
576         }
577         parent = hammer2_chain_lookup_init(ip->chain, HAMMER2_LOOKUP_SHARED);
578         chain = hammer2_chain_lookup(&parent, lkey, lkey,
579                                      HAMMER2_LOOKUP_SHARED);
580         if (chain == NULL) {
581                 chain = hammer2_chain_lookup(&parent,
582                                              lkey, (hammer2_key_t)-1,
583                                              HAMMER2_LOOKUP_SHARED);
584         }
585         while (chain) {
586                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
587                         dtype = hammer2_get_dtype(chain);
588                         saveoff = chain->bref.key & HAMMER2_DIRHASH_USERMSK;
589                         r = vop_write_dirent(&error, uio,
590                                              chain->data->ipdata.inum &
591                                               HAMMER2_DIRHASH_USERMSK,
592                                              dtype,
593                                              chain->data->ipdata.name_len,
594                                              chain->data->ipdata.filename);
595                         if (r)
596                                 break;
597                         if (cookies)
598                                 cookies[cookie_index] = saveoff;
599                         ++cookie_index;
600                 } else {
601                         /* XXX chain error */
602                         kprintf("bad chain type readdir %d\n",
603                                 chain->bref.type);
604                 }
605
606                 /*
607                  * Keys may not be returned in order so once we have a
608                  * placemarker (chain) the scan must allow the full range
609                  * or some entries will be missed.
610                  */
611                 chain = hammer2_chain_next(&parent, chain,
612                                            HAMMER2_DIRHASH_VISIBLE,
613                                            (hammer2_key_t)-1,
614                                            HAMMER2_LOOKUP_SHARED);
615                 if (chain) {
616                         saveoff = (chain->bref.key &
617                                    HAMMER2_DIRHASH_USERMSK) + 1;
618                 } else {
619                         saveoff = (hammer2_key_t)-1;
620                 }
621                 if (cookie_index == ncookies)
622                         break;
623         }
624         if (chain)
625                 hammer2_chain_unlock(chain);
626         hammer2_chain_lookup_done(parent);
627 done:
628         hammer2_inode_unlock_sh(ip);
629         if (ap->a_eofflag)
630                 *ap->a_eofflag = (chain == NULL);
631         uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
632         if (error && cookie_index == 0) {
633                 if (cookies) {
634                         kfree(cookies, M_TEMP);
635                         *ap->a_ncookies = 0;
636                         *ap->a_cookies = NULL;
637                 }
638         } else {
639                 if (cookies) {
640                         *ap->a_ncookies = cookie_index;
641                         *ap->a_cookies = cookies;
642                 }
643         }
644         return (error);
645 }
646
647 /*
648  * hammer2_vop_readlink { vp, uio, cred }
649  */
650 static
651 int
652 hammer2_vop_readlink(struct vop_readlink_args *ap)
653 {
654         struct vnode *vp;
655         hammer2_mount_t *hmp;
656         hammer2_inode_t *ip;
657         int error;
658
659         vp = ap->a_vp;
660         if (vp->v_type != VLNK)
661                 return (EINVAL);
662         ip = VTOI(vp);
663         hmp = ip->hmp;
664
665         error = hammer2_read_file(ip, ap->a_uio, 0);
666         return (error);
667 }
668
669 static
670 int
671 hammer2_vop_read(struct vop_read_args *ap)
672 {
673         struct vnode *vp;
674         hammer2_mount_t *hmp;
675         hammer2_inode_t *ip;
676         struct uio *uio;
677         int error;
678         int seqcount;
679         int bigread;
680
681         /*
682          * Read operations supported on this vnode?
683          */
684         vp = ap->a_vp;
685         if (vp->v_type != VREG)
686                 return (EINVAL);
687
688         /*
689          * Misc
690          */
691         ip = VTOI(vp);
692         hmp = ip->hmp;
693         uio = ap->a_uio;
694         error = 0;
695
696         seqcount = ap->a_ioflag >> 16;
697         bigread = (uio->uio_resid > 100 * 1024 * 1024);
698
699         error = hammer2_read_file(ip, uio, seqcount);
700         return (error);
701 }
702
703 static
704 int
705 hammer2_vop_write(struct vop_write_args *ap)
706 {
707         hammer2_mount_t *hmp;
708         hammer2_inode_t *ip;
709         hammer2_trans_t trans;
710         thread_t td;
711         struct vnode *vp;
712         struct uio *uio;
713         int error;
714         int seqcount;
715         int bigwrite;
716
717         /*
718          * Read operations supported on this vnode?
719          */
720         vp = ap->a_vp;
721         if (vp->v_type != VREG)
722                 return (EINVAL);
723
724         /*
725          * Misc
726          */
727         ip = VTOI(vp);
728         hmp = ip->hmp;
729         uio = ap->a_uio;
730         error = 0;
731         if (hmp->ronly)
732                 return (EROFS);
733
734         seqcount = ap->a_ioflag >> 16;
735         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
736
737         /*
738          * Check resource limit
739          */
740         if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
741             uio->uio_offset + uio->uio_resid >
742              td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
743                 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
744                 return (EFBIG);
745         }
746
747         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
748
749         /*
750          * ip must be locked if extending the file.
751          * ip must be locked to avoid racing a truncation.
752          *
753          * ip must be marked modified, particularly because the write
754          * might wind up being copied into the embedded data area.
755          */
756         hammer2_inode_lock_ex(ip);
757         hammer2_trans_init(&trans, ip->hmp);
758         error = hammer2_write_file(ip, &trans, uio, ap->a_ioflag, seqcount);
759         hammer2_inode_unlock_ex(ip);
760         hammer2_trans_done(&trans);
761
762         return (error);
763 }
764
765 /*
766  * Perform read operations on a file or symlink given an UNLOCKED
767  * inode and uio.
768  *
769  * The passed ip is not locked.
770  */
771 static
772 int
773 hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
774 {
775         hammer2_off_t size;
776         struct buf *bp;
777         int error;
778
779         error = 0;
780
781         /*
782          * UIO read loop.
783          */
784         hammer2_inode_lock_sh(ip);
785         size = ip->chain->data->ipdata.size;
786
787         while (uio->uio_resid > 0 && uio->uio_offset < size) {
788                 hammer2_key_t lbase;
789                 hammer2_key_t leof;
790                 int lblksize;
791                 int loff;
792                 int n;
793
794                 lblksize = hammer2_calc_logical(ip, uio->uio_offset,
795                                                 &lbase, &leof);
796
797                 error = cluster_read(ip->vp, leof, lbase, lblksize,
798                                      uio->uio_resid, seqcount * BKVASIZE,
799                                      &bp);
800
801                 if (error)
802                         break;
803                 loff = (int)(uio->uio_offset - lbase);
804                 n = lblksize - loff;
805                 if (n > uio->uio_resid)
806                         n = uio->uio_resid;
807                 if (n > size - uio->uio_offset)
808                         n = (int)(size - uio->uio_offset);
809                 bp->b_flags |= B_AGE;
810                 uiomove((char *)bp->b_data + loff, n, uio);
811                 bqrelse(bp);
812         }
813         hammer2_inode_unlock_sh(ip);
814         return (error);
815 }
816
817 /*
818  * Called with a locked (ip) to do the underlying write to a file or
819  * to build the symlink target.
820  */
821 static
822 int
823 hammer2_write_file(hammer2_inode_t *ip, hammer2_trans_t *trans,
824                    struct uio *uio, int ioflag, int seqcount)
825 {
826         hammer2_inode_data_t *ipdata;
827         hammer2_key_t old_eof;
828         struct buf *bp;
829         int kflags;
830         int error;
831         int modified = 0;
832
833         /*
834          * Setup if append
835          */
836         ipdata = &ip->chain->data->ipdata;
837         if (ioflag & IO_APPEND)
838                 uio->uio_offset = ipdata->size;
839         kflags = 0;
840         error = 0;
841
842         /*
843          * Extend the file if necessary.  If the write fails at some point
844          * we will truncate it back down to cover as much as we were able
845          * to write.
846          *
847          * Doing this now makes it easier to calculate buffer sizes in
848          * the loop.
849          */
850         KKASSERT(ipdata->type != HAMMER2_OBJTYPE_HARDLINK);
851         old_eof = ipdata->size;
852         if (uio->uio_offset + uio->uio_resid > ipdata->size) {
853                 modified = 1;
854                 hammer2_extend_file(trans, ip,
855                                     uio->uio_offset + uio->uio_resid);
856                 ipdata = &ip->chain->data->ipdata;      /* RELOAD */
857                 kflags |= NOTE_EXTEND;
858         }
859         KKASSERT(ipdata->type != HAMMER2_OBJTYPE_HARDLINK);
860
861         /*
862          * UIO write loop
863          */
864         while (uio->uio_resid > 0) {
865                 hammer2_key_t lbase;
866                 hammer2_key_t leof;
867                 int trivial;
868                 int lblksize;
869                 int loff;
870                 int n;
871
872                 /*
873                  * Don't allow the buffer build to blow out the buffer
874                  * cache.
875                  */
876                 if ((ioflag & IO_RECURSE) == 0) {
877                         /*
878                          * XXX should try to leave this unlocked through
879                          *      the whole loop
880                          */
881                         hammer2_inode_unlock_ex(ip);
882                         bwillwrite(HAMMER2_PBUFSIZE);
883                         hammer2_inode_lock_ex(ip);
884                         ipdata = &ip->chain->data->ipdata;      /* reload */
885                 }
886
887                 /* XXX bigwrite & signal check test */
888
889                 /*
890                  * This nominally tells us how much we can cluster and
891                  * what the logical buffer size needs to be.  Currently
892                  * we don't try to cluster the write and just handle one
893                  * block at a time.
894                  */
895                 lblksize = hammer2_calc_logical(ip, uio->uio_offset,
896                                                 &lbase, &leof);
897                 loff = (int)(uio->uio_offset - lbase);
898
899                 /*
900                  * Calculate bytes to copy this transfer and whether the
901                  * copy completely covers the buffer or not.
902                  */
903                 trivial = 0;
904                 n = lblksize - loff;
905                 if (n > uio->uio_resid) {
906                         n = uio->uio_resid;
907                         if (uio->uio_offset + n == ipdata->size)
908                                 trivial = 1;
909                 } else if (loff == 0) {
910                         trivial = 1;
911                 }
912
913                 /*
914                  * Get the buffer
915                  */
916                 if (uio->uio_segflg == UIO_NOCOPY) {
917                         /*
918                          * Issuing a write with the same data backing the
919                          * buffer.  Instantiate the buffer to collect the
920                          * backing vm pages, then read-in any missing bits.
921                          *
922                          * This case is used by vop_stdputpages().
923                          */
924                         bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
925                         if ((bp->b_flags & B_CACHE) == 0) {
926                                 bqrelse(bp);
927                                 error = bread(ip->vp, lbase, lblksize, &bp);
928                         }
929                 } else if (trivial) {
930                         /*
931                          * Even though we are entirely overwriting the buffer
932                          * we may still have to zero it out to avoid a
933                          * mmap/write visibility issue.
934                          */
935                         bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
936                         if ((bp->b_flags & B_CACHE) == 0)
937                                 vfs_bio_clrbuf(bp);
938                 } else {
939                         /*
940                          * Partial overwrite, read in any missing bits then
941                          * replace the portion being written.
942                          *
943                          * (The strategy code will detect zero-fill physical
944                          * blocks for this case).
945                          */
946                         error = bread(ip->vp, lbase, lblksize, &bp);
947                         if (error == 0)
948                                 bheavy(bp);
949                 }
950
951                 if (error) {
952                         brelse(bp);
953                         break;
954                 }
955
956                 /*
957                  * We have to assign physical storage to the buffer we intend
958                  * to dirty or write now to avoid deadlocks in the strategy
959                  * code later.
960                  *
961                  * This can return NOOFFSET for inode-embedded data.  The
962                  * strategy code will take care of it in that case.
963                  */
964                 bp->b_bio2.bio_offset =
965                         hammer2_assign_physical(trans, ip,
966                                                 lbase, lblksize, &error);
967                 ipdata = &ip->chain->data->ipdata;      /* RELOAD */
968                 if (error) {
969                         brelse(bp);
970                         break;
971                 }
972
973                 /*
974                  * Ok, copy the data in
975                  */
976                 hammer2_inode_unlock_ex(ip);
977                 error = uiomove(bp->b_data + loff, n, uio);
978                 hammer2_inode_lock_ex(ip);
979                 ipdata = &ip->chain->data->ipdata;      /* reload */
980                 kflags |= NOTE_WRITE;
981                 modified = 1;
982
983                 if (error) {
984                         brelse(bp);
985                         break;
986                 }
987
988                 /* XXX update ip_data.mtime */
989
990                 /*
991                  * Once we dirty a buffer any cached offset becomes invalid.
992                  *
993                  * NOTE: For cluster_write() always use the trailing block
994                  *       size, which is HAMMER2_PBUFSIZE.  lblksize is the
995                  *       eof-straddling blocksize and is incorrect.
996                  */
997                 bp->b_flags |= B_AGE;
998                 if (ioflag & IO_SYNC) {
999                         bwrite(bp);
1000                 } else if ((ioflag & IO_DIRECT) && loff + n == lblksize) {
1001                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
1002                                 bp->b_flags |= B_CLUSTEROK;
1003                         bdwrite(bp);
1004                 } else if (ioflag & IO_ASYNC) {
1005                         bawrite(bp);
1006                 } else if (hammer2_cluster_enable) {
1007                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
1008                                 bp->b_flags |= B_CLUSTEROK;
1009                         cluster_write(bp, leof, HAMMER2_PBUFSIZE, seqcount);
1010                 } else {
1011                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
1012                                 bp->b_flags |= B_CLUSTEROK;
1013                         bdwrite(bp);
1014                 }
1015         }
1016
1017         /*
1018          * Cleanup.  If we extended the file EOF but failed to write through
1019          * the entire write is a failure and we have to back-up.
1020          */
1021         if (error && ipdata->size != old_eof) {
1022                 hammer2_truncate_file(trans, ip, old_eof);
1023                 ipdata = &ip->chain->data->ipdata;      /* RELOAD */
1024         } else if (modified) {
1025                 hammer2_chain_modify(trans, ip->chain, 0);
1026                 ipdata = &ip->chain->data->ipdata;      /* RELOAD */
1027                 hammer2_update_time(&ipdata->mtime);
1028         }
1029         hammer2_knote(ip->vp, kflags);
1030
1031         return error;
1032 }
1033
1034 /*
1035  * Assign physical storage to a logical block.  This function creates the
1036  * related meta-data chains representing the data blocks and marks them
1037  * MODIFIED.  We could mark them MOVED instead but ultimately I need to
1038  * XXX code the flusher to check that the related logical buffer is
1039  * flushed.
1040  *
1041  * NOOFFSET is returned if the data is inode-embedded.  In this case the
1042  * strategy code will simply bcopy() the data into the inode.
1043  *
1044  * The inode's delta_dcount is adjusted.
1045  */
1046 static
1047 hammer2_off_t
1048 hammer2_assign_physical(hammer2_trans_t *trans, hammer2_inode_t *ip,
1049                         hammer2_key_t lbase, int lblksize, int *errorp)
1050 {
1051         hammer2_mount_t *hmp;
1052         hammer2_chain_t *parent;
1053         hammer2_chain_t *chain;
1054         hammer2_off_t pbase;
1055
1056         /*
1057          * Locate the chain associated with lbase, return a locked chain.
1058          * However, do not instantiate any data reference (which utilizes a
1059          * device buffer) because we will be using direct IO via the
1060          * logical buffer cache buffer.
1061          */
1062         hmp = ip->hmp;
1063         *errorp = 0;
1064 retry:
1065         hammer2_inode_lock_ex(ip);
1066         parent = hammer2_chain_lookup_init(ip->chain, 0);
1067         chain = hammer2_chain_lookup(&parent,
1068                                      lbase, lbase,
1069                                      HAMMER2_LOOKUP_NODATA);
1070
1071         if (chain == NULL) {
1072                 /*
1073                  * We found a hole, create a new chain entry.
1074                  *
1075                  * NOTE: DATA chains are created without device backing
1076                  *       store (nor do we want any).
1077                  */
1078                 *errorp = hammer2_chain_create(trans, &parent, &chain,
1079                                                lbase, HAMMER2_PBUFRADIX,
1080                                                HAMMER2_BREF_TYPE_DATA,
1081                                                lblksize);
1082                 if (chain == NULL) {
1083                         hammer2_inode_unlock_ex(ip);
1084                         hammer2_chain_lookup_done(parent);
1085                         panic("hammer2_chain_create: par=%p error=%d\n",
1086                                 parent, *errorp);
1087                         goto retry;
1088                 }
1089
1090                 pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
1091                 /*ip->delta_dcount += lblksize;*/
1092         } else {
1093                 switch (chain->bref.type) {
1094                 case HAMMER2_BREF_TYPE_INODE:
1095                         /*
1096                          * The data is embedded in the inode.  The
1097                          * caller is responsible for marking the inode
1098                          * modified and copying the data to the embedded
1099                          * area.
1100                          */
1101                         pbase = NOOFFSET;
1102                         break;
1103                 case HAMMER2_BREF_TYPE_DATA:
1104                         if (chain->bytes != lblksize) {
1105                                 panic("hammer2_assign_physical: "
1106                                       "size mismatch %d/%d\n",
1107                                       lblksize, chain->bytes);
1108                         }
1109                         hammer2_chain_modify(trans, chain,
1110                                              HAMMER2_MODIFY_OPTDATA);
1111                         pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
1112                         break;
1113                 default:
1114                         panic("hammer2_assign_physical: bad type");
1115                         /* NOT REACHED */
1116                         pbase = NOOFFSET;
1117                         break;
1118                 }
1119         }
1120         if (chain)
1121                 hammer2_chain_unlock(chain);
1122         hammer2_chain_lookup_done(parent);
1123
1124         hammer2_inode_unlock_ex(ip);
1125
1126         return (pbase);
1127 }
1128
1129 /*
1130  * Truncate the size of a file.
1131  *
1132  * This routine adjusts ipdata->size smaller, destroying any related
1133  * data beyond the new EOF and potentially resizing the block straddling
1134  * the EOF.
1135  *
1136  * The inode must be locked.
1137  */
1138 static
1139 void
1140 hammer2_truncate_file(hammer2_trans_t *trans,
1141                       hammer2_inode_t *ip, hammer2_key_t nsize)
1142 {
1143         hammer2_inode_data_t *ipdata;
1144         hammer2_chain_t *parent;
1145         hammer2_chain_t *chain;
1146         hammer2_key_t lbase;
1147         hammer2_key_t leof;
1148         struct buf *bp;
1149         int loff;
1150         int error;
1151         int oblksize;
1152         int nblksize;
1153
1154         hammer2_chain_modify(trans, ip->chain, 0);
1155         bp = NULL;
1156         ipdata = &ip->chain->data->ipdata;
1157         error = 0;
1158
1159         /*
1160          * Destroy any logical buffer cache buffers beyond the file EOF.
1161          *
1162          * We call nvtruncbuf() w/ trivial == 1 to prevent it from messing
1163          * around with the buffer straddling EOF, because we need to assign
1164          * a new physical offset to it.
1165          */
1166         if (ip->vp) {
1167                 nvtruncbuf(ip->vp, nsize,
1168                            HAMMER2_PBUFSIZE, (int)nsize & HAMMER2_PBUFMASK,
1169                            1);
1170         }
1171
1172         /*
1173          * Setup for lookup/search
1174          */
1175         parent = hammer2_chain_lookup_init(ip->chain, 0);
1176
1177         /*
1178          * Handle the case where a chain/logical-buffer straddles the new
1179          * EOF.  We told nvtruncbuf() above not to mess with the logical
1180          * buffer straddling the EOF because we need to reassign its storage
1181          * and can't let the strategy code do it for us.
1182          */
1183         loff = (int)nsize & HAMMER2_PBUFMASK;
1184         if (loff && ip->vp) {
1185                 oblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
1186                 error = bread(ip->vp, lbase, oblksize, &bp);
1187                 KKASSERT(error == 0);
1188         }
1189         ipdata->size = nsize;
1190         nblksize = hammer2_calc_logical(ip, nsize, &lbase, &leof);
1191
1192         /*
1193          * Fixup the chain element.  If we have a logical buffer in-hand
1194          * we don't want to create a conflicting device buffer.
1195          */
1196         if (loff && bp) {
1197                 chain = hammer2_chain_lookup(&parent, lbase, lbase,
1198                                              HAMMER2_LOOKUP_NODATA);
1199                 if (chain) {
1200                         switch(chain->bref.type) {
1201                         case HAMMER2_BREF_TYPE_DATA:
1202                                 hammer2_chain_resize(trans, ip, bp,
1203                                              parent, &chain,
1204                                              hammer2_allocsize(nblksize),
1205                                              HAMMER2_MODIFY_OPTDATA);
1206                                 allocbuf(bp, nblksize);
1207                                 bzero(bp->b_data + loff, nblksize - loff);
1208                                 bp->b_bio2.bio_offset = chain->bref.data_off &
1209                                                         HAMMER2_OFF_MASK;
1210                                 break;
1211                         case HAMMER2_BREF_TYPE_INODE:
1212                                 allocbuf(bp, nblksize);
1213                                 bzero(bp->b_data + loff, nblksize - loff);
1214                                 bp->b_bio2.bio_offset = NOOFFSET;
1215                                 break;
1216                         default:
1217                                 panic("hammer2_truncate_file: bad type");
1218                                 break;
1219                         }
1220                         hammer2_chain_unlock(chain);
1221                         if (bp->b_bcount == HAMMER2_PBUFSIZE)
1222                                 bp->b_flags |= B_CLUSTEROK;
1223                         bdwrite(bp);
1224                 } else {
1225                         /*
1226                          * Destroy clean buffer w/ wrong buffer size.  Retain
1227                          * backing store.
1228                          */
1229                         bp->b_flags |= B_RELBUF;
1230                         KKASSERT(bp->b_bio2.bio_offset == NOOFFSET);
1231                         KKASSERT((bp->b_flags & B_DIRTY) == 0);
1232                         bqrelse(bp);
1233                 }
1234         } else if (loff) {
1235                 /*
1236                  * WARNING: This utilizes a device buffer for the data.
1237                  *
1238                  * This case should not occur because file truncations without
1239                  * a vnode (and hence no logical buffer cache) should only
1240                  * always truncate to 0-length.
1241                  */
1242                 panic("hammer2_truncate_file: non-zero truncation, no-vnode");
1243 #if 0
1244                 chain = hammer2_chain_lookup(&parent, lbase, lbase, 0);
1245                 if (chain) {
1246                         switch(chain->bref.type) {
1247                         case HAMMER2_BREF_TYPE_DATA:
1248                                 chain = hammer2_chain_resize(trans, ip, bp,
1249                                              parent, chain,
1250                                              hammer2_allocsize(nblksize),
1251                                              0);
1252                                 hammer2_chain_modify(hmp, chain, 0);
1253                                 bzero(chain->data->buf + loff, nblksize - loff);
1254                                 break;
1255                         case HAMMER2_BREF_TYPE_INODE:
1256                                 if (loff < HAMMER2_EMBEDDED_BYTES) {
1257                                         hammer2_chain_modify(hmp, chain, 0);
1258                                         bzero(chain->data->ipdata.u.data + loff,
1259                                               HAMMER2_EMBEDDED_BYTES - loff);
1260                                 }
1261                                 break;
1262                         }
1263                         hammer2_chain_unlock(chain);
1264                 }
1265 #endif
1266         }
1267
1268         /*
1269          * Clean up any fragmentory VM pages now that we have properly
1270          * resized the straddling buffer.  These pages are no longer
1271          * part of the buffer.
1272          */
1273         if (ip->vp) {
1274                 nvtruncbuf(ip->vp, nsize,
1275                            nblksize, (int)nsize & (nblksize - 1),
1276                            1);
1277         }
1278
1279         /*
1280          * Destroy any physical blocks after the new EOF point.
1281          */
1282         lbase = (nsize + HAMMER2_PBUFMASK64) & ~HAMMER2_PBUFMASK64;
1283         chain = hammer2_chain_lookup(&parent,
1284                                      lbase, (hammer2_key_t)-1,
1285                                      HAMMER2_LOOKUP_NODATA);
1286         while (chain) {
1287                 /*
1288                  * Degenerate embedded data case, nothing to loop on.
1289                  */
1290                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1291                         hammer2_chain_unlock(chain);
1292                         break;
1293                 }
1294
1295                 /*
1296                  * Delete physical data blocks past the file EOF.
1297                  */
1298                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1299                         /*ip->delta_dcount -= chain->bytes;*/
1300                         hammer2_chain_delete(trans, parent, chain);
1301                 }
1302                 /* XXX check parent if empty indirect block & delete */
1303                 chain = hammer2_chain_next(&parent, chain,
1304                                            lbase, (hammer2_key_t)-1,
1305                                            HAMMER2_LOOKUP_NODATA);
1306         }
1307         hammer2_chain_lookup_done(parent);
1308 }
1309
1310 /*
1311  * Extend the size of a file.  The inode must be locked.
1312  *
1313  * We may have to resize the block straddling the old EOF.
1314  */
1315 static
1316 void
1317 hammer2_extend_file(hammer2_trans_t *trans,
1318                     hammer2_inode_t *ip, hammer2_key_t nsize)
1319 {
1320         hammer2_inode_data_t *ipdata;
1321         hammer2_mount_t *hmp;
1322         hammer2_chain_t *parent;
1323         hammer2_chain_t *chain;
1324         struct buf *bp;
1325         hammer2_key_t osize;
1326         hammer2_key_t obase;
1327         hammer2_key_t nbase;
1328         hammer2_key_t leof;
1329         int oblksize;
1330         int nblksize;
1331         int nradix;
1332         int error;
1333
1334         KKASSERT(ip->vp);
1335         hmp = ip->hmp;
1336
1337         hammer2_chain_modify(trans, ip->chain, 0);
1338         ipdata = &ip->chain->data->ipdata;
1339
1340         /*
1341          * Nothing to do if the direct-data case is still intact
1342          */
1343         if ((ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
1344             nsize <= HAMMER2_EMBEDDED_BYTES) {
1345                 ipdata->size = nsize;
1346                 nvextendbuf(ip->vp,
1347                             ipdata->size, nsize,
1348                             0, HAMMER2_EMBEDDED_BYTES,
1349                             0, (int)nsize,
1350                             1);
1351                 /* ipdata = &ip->chain->data->ipdata; RELOAD */
1352                 return;
1353         }
1354
1355         /*
1356          * Calculate the blocksize at the original EOF and resize the block
1357          * if necessary.  Adjust the file size in the inode.
1358          */
1359         osize = ipdata->size;
1360         oblksize = hammer2_calc_logical(ip, osize, &obase, &leof);
1361         ipdata->size = nsize;
1362         nblksize = hammer2_calc_logical(ip, osize, &nbase, &leof);
1363
1364         /*
1365          * Do all required vnode operations, but do not mess with the
1366          * buffer straddling the orignal EOF.
1367          */
1368         nvextendbuf(ip->vp,
1369                     ipdata->size, nsize,
1370                     0, nblksize,
1371                     0, (int)nsize & HAMMER2_PBUFMASK,
1372                     1);
1373         ipdata = &ip->chain->data->ipdata;
1374
1375         /*
1376          * Early return if we have no more work to do.
1377          */
1378         if (obase == nbase && oblksize == nblksize &&
1379             (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) == 0) {
1380                 return;
1381         }
1382
1383         /*
1384          * We have work to do, including possibly resizing the buffer
1385          * at the previous EOF point and turning off DIRECTDATA mode.
1386          */
1387         bp = NULL;
1388         if (((int)osize & HAMMER2_PBUFMASK)) {
1389                 error = bread(ip->vp, obase, oblksize, &bp);
1390                 KKASSERT(error == 0);
1391         }
1392
1393         /*
1394          * Disable direct-data mode by loading up a buffer cache buffer
1395          * with the data, then converting the inode data area into the
1396          * inode indirect block array area.
1397          */
1398         if (ipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1399                 ipdata->op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
1400                 bzero(&ipdata->u.blockset, sizeof(ipdata->u.blockset));
1401         }
1402
1403         /*
1404          * Resize the chain element at the old EOF.
1405          */
1406         if (((int)osize & HAMMER2_PBUFMASK)) {
1407 retry:
1408                 error = 0;
1409                 parent = hammer2_chain_lookup_init(ip->chain, 0);
1410                 nradix = hammer2_allocsize(nblksize);
1411
1412                 chain = hammer2_chain_lookup(&parent,
1413                                              obase, obase,
1414                                              HAMMER2_LOOKUP_NODATA);
1415                 if (chain == NULL) {
1416                         error = hammer2_chain_create(trans, &parent, &chain,
1417                                                      obase, nblksize,
1418                                                      HAMMER2_BREF_TYPE_DATA,
1419                                                      nblksize);
1420                         if (chain == NULL) {
1421                                 hammer2_chain_lookup_done(parent);
1422                                 panic("hammer2_chain_create: par=%p error=%d\n",
1423                                         parent, error);
1424                                 goto retry;
1425                         }
1426                         /*ip->delta_dcount += nblksize;*/
1427                 } else {
1428                         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA);
1429                         hammer2_chain_resize(trans, ip, bp,
1430                                              parent, &chain,
1431                                              nradix,
1432                                              HAMMER2_MODIFY_OPTDATA);
1433                 }
1434                 if (obase != nbase) {
1435                         if (oblksize != HAMMER2_PBUFSIZE)
1436                                 allocbuf(bp, HAMMER2_PBUFSIZE);
1437                 } else {
1438                         if (oblksize != nblksize)
1439                                 allocbuf(bp, nblksize);
1440                 }
1441                 bp->b_bio2.bio_offset = chain->bref.data_off &
1442                                         HAMMER2_OFF_MASK;
1443                 hammer2_chain_unlock(chain);
1444                 if (bp->b_bcount == HAMMER2_PBUFSIZE)
1445                         bp->b_flags |= B_CLUSTEROK;
1446                 bdwrite(bp);
1447                 hammer2_chain_lookup_done(parent);  /* must be after bdwrite */
1448         }
1449 }
1450
1451 static
1452 int
1453 hammer2_vop_nresolve(struct vop_nresolve_args *ap)
1454 {
1455         hammer2_inode_t *ip;
1456         hammer2_inode_t *dip;
1457         hammer2_mount_t *hmp;
1458         hammer2_chain_t *parent;
1459         hammer2_chain_t *chain;
1460         hammer2_chain_t *ochain;
1461         hammer2_trans_t trans;
1462         struct namecache *ncp;
1463         const uint8_t *name;
1464         size_t name_len;
1465         hammer2_key_t lhc;
1466         int error = 0;
1467         struct vnode *vp;
1468
1469         dip = VTOI(ap->a_dvp);
1470         hmp = dip->hmp;
1471         ncp = ap->a_nch->ncp;
1472         name = ncp->nc_name;
1473         name_len = ncp->nc_nlen;
1474         lhc = hammer2_dirhash(name, name_len);
1475
1476         /*
1477          * Note: In DragonFly the kernel handles '.' and '..'.
1478          */
1479         hammer2_inode_lock_sh(dip);
1480         parent = hammer2_chain_lookup_init(dip->chain, HAMMER2_LOOKUP_SHARED);
1481         chain = hammer2_chain_lookup(&parent,
1482                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
1483                                      HAMMER2_LOOKUP_SHARED);
1484         while (chain) {
1485                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
1486                     name_len == chain->data->ipdata.name_len &&
1487                     bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
1488                         break;
1489                 }
1490                 chain = hammer2_chain_next(&parent, chain,
1491                                            lhc, lhc + HAMMER2_DIRHASH_LOMASK,
1492                                            HAMMER2_LOOKUP_SHARED);
1493         }
1494         hammer2_chain_lookup_done(parent);
1495         hammer2_inode_unlock_sh(dip);
1496
1497         /*
1498          * If the inode represents a forwarding entry for a hardlink we have
1499          * to locate the actual inode.  The original ip is saved for possible
1500          * deconsolidation.  (ip) will only be set to non-NULL when we have
1501          * to locate the real file via a hardlink.  ip will be referenced but
1502          * not locked in that situation.  chain is passed in locked and
1503          * returned locked.
1504          *
1505          * XXX what kind of chain lock?
1506          */
1507         ochain = NULL;
1508         if (chain && chain->data->ipdata.type == HAMMER2_OBJTYPE_HARDLINK) {
1509                 error = hammer2_hardlink_find(dip, &chain, &ochain);
1510                 if (error) {
1511                         kprintf("hammer2: unable to find hardlink\n");
1512                         if (chain) {
1513                                 hammer2_chain_unlock(chain);
1514                                 chain = NULL;
1515                         }
1516                         goto failed;
1517                 }
1518         }
1519
1520         /*
1521          * Deconsolidate any hardlink whos nlinks == 1.  Ignore errors.
1522          * If an error occurs chain and ip are left alone.
1523          *
1524          * XXX upgrade shared lock?
1525          */
1526         if (ochain && chain && chain->data->ipdata.nlinks == 1 && !hmp->ronly) {
1527                 kprintf("hammer2: need to unconsolidate hardlink for %s\n",
1528                         chain->data->ipdata.filename);
1529                 /* XXX retain shared lock on dip? (currently not held) */
1530                 hammer2_trans_init(&trans, dip->hmp);
1531                 hammer2_hardlink_deconsolidate(&trans, dip, &chain, &ochain);
1532                 hammer2_trans_done(&trans);
1533         }
1534
1535         /*
1536          * Acquire the related vnode
1537          *
1538          * NOTE: For error processing, only ENOENT resolves the namecache
1539          *       entry to NULL, otherwise we just return the error and
1540          *       leave the namecache unresolved.
1541          *
1542          * NOTE: multiple hammer2_inode structures can be aliased to the
1543          *       same chain element, for example for hardlinks.  This
1544          *       use case does not 'reattach' inode associations that
1545          *       might already exist, but always allocates a new one.
1546          *
1547          * WARNING: inode structure is locked exclusively via inode_get
1548          *          but chain was locked shared.  inode_unlock_ex()
1549          *          will handle it properly.
1550          */
1551         if (chain) {
1552                 ip = hammer2_inode_get(hmp, dip->pmp, dip, chain);
1553                 vp = hammer2_igetv(ip, &error);
1554                 if (error == 0) {
1555                         vn_unlock(vp);
1556                         cache_setvp(ap->a_nch, vp);
1557                 } else if (error == ENOENT) {
1558                         cache_setvp(ap->a_nch, NULL);
1559                 }
1560                 hammer2_inode_unlock_ex(ip);
1561
1562                 /*
1563                  * The vp should not be released until after we've disposed
1564                  * of our locks, because it might cause vop_inactive() to
1565                  * be called.
1566                  */
1567                 if (vp)
1568                         vrele(vp);
1569         } else {
1570                 error = ENOENT;
1571                 cache_setvp(ap->a_nch, NULL);
1572         }
1573 failed:
1574         KASSERT(error || ap->a_nch->ncp->nc_vp != NULL,
1575                 ("resolve error %d/%p chain %p ap %p\n",
1576                  error, ap->a_nch->ncp->nc_vp, chain, ap));
1577         if (ochain)
1578                 hammer2_chain_drop(ochain);
1579         return error;
1580 }
1581
1582 static
1583 int
1584 hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1585 {
1586         hammer2_inode_t *dip;
1587         hammer2_inode_t *ip;
1588         hammer2_mount_t *hmp;
1589         int error;
1590
1591         dip = VTOI(ap->a_dvp);
1592         hmp = dip->hmp;
1593
1594         if ((ip = dip->pip) == NULL) {
1595                 *ap->a_vpp = NULL;
1596                 return ENOENT;
1597         }
1598         hammer2_inode_lock_ex(ip);
1599         *ap->a_vpp = hammer2_igetv(ip, &error);
1600         hammer2_inode_unlock_ex(ip);
1601
1602         return error;
1603 }
1604
1605 static
1606 int
1607 hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
1608 {
1609         hammer2_mount_t *hmp;
1610         hammer2_inode_t *dip;
1611         hammer2_inode_t *nip;
1612         hammer2_trans_t trans;
1613         struct namecache *ncp;
1614         const uint8_t *name;
1615         size_t name_len;
1616         int error;
1617
1618         dip = VTOI(ap->a_dvp);
1619         hmp = dip->hmp;
1620         if (hmp->ronly)
1621                 return (EROFS);
1622
1623         ncp = ap->a_nch->ncp;
1624         name = ncp->nc_name;
1625         name_len = ncp->nc_nlen;
1626
1627         hammer2_trans_init(&trans, hmp);
1628         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1629                                    name, name_len, &error);
1630         if (error) {
1631                 KKASSERT(nip == NULL);
1632                 *ap->a_vpp = NULL;
1633         } else {
1634                 *ap->a_vpp = hammer2_igetv(nip, &error);
1635                 hammer2_inode_unlock_ex(nip);
1636         }
1637         hammer2_trans_done(&trans);
1638
1639         if (error == 0) {
1640                 cache_setunresolved(ap->a_nch);
1641                 cache_setvp(ap->a_nch, *ap->a_vpp);
1642         }
1643         return error;
1644 }
1645
1646 /*
1647  * Return the largest contiguous physical disk range for the logical
1648  * request.
1649  *
1650  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
1651  */
1652 static
1653 int
1654 hammer2_vop_bmap(struct vop_bmap_args *ap)
1655 {
1656         struct vnode *vp;
1657         hammer2_mount_t *hmp;
1658         hammer2_inode_t *ip;
1659         hammer2_chain_t *parent;
1660         hammer2_chain_t *chain;
1661         hammer2_key_t lbeg;
1662         hammer2_key_t lend;
1663         hammer2_off_t pbeg;
1664         hammer2_off_t pbytes;
1665         hammer2_off_t array[HAMMER2_BMAP_COUNT][2];
1666         int loff;
1667         int ai;
1668
1669         /*
1670          * Only supported on regular files
1671          *
1672          * Only supported for read operations (required for cluster_read).
1673          * The block allocation is delayed for write operations.
1674          */
1675         vp = ap->a_vp;
1676         if (vp->v_type != VREG)
1677                 return (EOPNOTSUPP);
1678         if (ap->a_cmd != BUF_CMD_READ)
1679                 return (EOPNOTSUPP);
1680
1681         ip = VTOI(vp);
1682         hmp = ip->hmp;
1683         bzero(array, sizeof(array));
1684
1685         /*
1686          * Calculate logical range
1687          */
1688         KKASSERT((ap->a_loffset & HAMMER2_LBUFMASK64) == 0);
1689         lbeg = ap->a_loffset & HAMMER2_OFF_MASK_HI;
1690         lend = lbeg + HAMMER2_BMAP_COUNT * HAMMER2_PBUFSIZE - 1;
1691         if (lend < lbeg)
1692                 lend = lbeg;
1693         loff = ap->a_loffset & HAMMER2_OFF_MASK_LO;
1694
1695         hammer2_inode_lock_sh(ip);
1696         parent = hammer2_chain_lookup_init(ip->chain, HAMMER2_LOOKUP_SHARED);
1697         chain = hammer2_chain_lookup(&parent,
1698                                      lbeg, lend,
1699                                      HAMMER2_LOOKUP_NODATA |
1700                                      HAMMER2_LOOKUP_SHARED);
1701         if (chain == NULL) {
1702                 *ap->a_doffsetp = ZFOFFSET;
1703                 hammer2_chain_lookup_done(parent);
1704                 hammer2_inode_unlock_sh(ip);
1705                 return (0);
1706         }
1707
1708         while (chain) {
1709                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1710                         ai = (chain->bref.key - lbeg) / HAMMER2_PBUFSIZE;
1711                         KKASSERT(ai >= 0 && ai < HAMMER2_BMAP_COUNT);
1712                         array[ai][0] = chain->bref.data_off & HAMMER2_OFF_MASK;
1713                         array[ai][1] = chain->bytes;
1714                 }
1715                 chain = hammer2_chain_next(&parent, chain,
1716                                            lbeg, lend,
1717                                            HAMMER2_LOOKUP_NODATA |
1718                                            HAMMER2_LOOKUP_SHARED);
1719         }
1720         hammer2_chain_lookup_done(parent);
1721         hammer2_inode_unlock_sh(ip);
1722
1723         /*
1724          * If the requested loffset is not mappable physically we can't
1725          * bmap.  The caller will have to access the file data via a
1726          * device buffer.
1727          */
1728         if (array[0][0] == 0 || array[0][1] < loff + HAMMER2_LBUFSIZE) {
1729                 *ap->a_doffsetp = NOOFFSET;
1730                 return (0);
1731         }
1732
1733         /*
1734          * Calculate the physical disk offset range for array[0]
1735          */
1736         pbeg = array[0][0] + loff;
1737         pbytes = array[0][1] - loff;
1738
1739         for (ai = 1; ai < HAMMER2_BMAP_COUNT; ++ai) {
1740                 if (array[ai][0] != pbeg + pbytes)
1741                         break;
1742                 pbytes += array[ai][1];
1743         }
1744
1745         *ap->a_doffsetp = pbeg;
1746         if (ap->a_runp)
1747                 *ap->a_runp = pbytes;
1748         return (0);
1749 }
1750
/*
 * hammer2_vop_open
 *
 * No hammer2-specific work is done here; the request is passed
 * straight to vop_stdopen().
 */
static
int
hammer2_vop_open(struct vop_open_args *ap)
{
	return (vop_stdopen(ap));
}
1757
1758 /*
1759  * hammer2_vop_advlock { vp, id, op, fl, flags }
1760  */
1761 static
1762 int
1763 hammer2_vop_advlock(struct vop_advlock_args *ap)
1764 {
1765         hammer2_inode_t *ip = VTOI(ap->a_vp);
1766         hammer2_off_t size;
1767
1768         hammer2_inode_lock_sh(ip);
1769         size = ip->chain->data->ipdata.size;
1770         hammer2_inode_unlock_sh(ip);
1771         return (lf_advlock(ap, &ip->advlock, size));
1772 }
1773
1774
/*
 * hammer2_vop_close
 *
 * No hammer2-specific work is done here; the request is passed
 * straight to vop_stdclose().
 */
static
int
hammer2_vop_close(struct vop_close_args *ap)
{
	return (vop_stdclose(ap));
}
1781
1782 /*
1783  * hammer2_vop_nlink { nch, dvp, vp, cred }
1784  *
1785  * Create a hardlink from (vp) to {dvp, nch}.
1786  */
1787 static
1788 int
1789 hammer2_vop_nlink(struct vop_nlink_args *ap)
1790 {
1791         hammer2_inode_t *dip;   /* target directory to create link in */
1792         hammer2_inode_t *ip;    /* inode we are hardlinking to */
1793         hammer2_mount_t *hmp;
1794         hammer2_chain_t *chain;
1795         hammer2_trans_t trans;
1796         struct namecache *ncp;
1797         const uint8_t *name;
1798         size_t name_len;
1799         int error;
1800
1801         dip = VTOI(ap->a_dvp);
1802         hmp = dip->hmp;
1803         if (hmp->ronly)
1804                 return (EROFS);
1805
1806         ncp = ap->a_nch->ncp;
1807         name = ncp->nc_name;
1808         name_len = ncp->nc_nlen;
1809         hammer2_trans_init(&trans, hmp);
1810
1811         /*
1812          * ip represents the file being hardlinked.  The file could be a
1813          * normal file or a hardlink target if it has already been hardlinked.
1814          * If ip is a hardlinked target then ip->pip represents the location
1815          * of the hardlinked target, NOT the location of the hardlink pointer.
1816          *
1817          * Bump nlinks and potentially also create or move the hardlink
1818          * target in the parent directory common to (ip) and (dip).  The
1819          * consolidation code can modify ip->chain and ip->pip.  The
1820          * returned chain is locked.
1821          */
1822         ip = VTOI(ap->a_vp);
1823         hammer2_inode_lock_ex(ip);
1824         error = hammer2_hardlink_consolidate(&trans, ip, &chain, dip, 1);
1825         if (error)
1826                 goto done;
1827
1828         /*
1829          * Create a directory entry connected to the specified chain.
1830          * The hardlink consolidation code has already adjusted ip->pip
1831          * to the common parent directory containing the actual hardlink
1832          *
1833          * (which may be different from dip where we created our hardlink
1834          * entry. ip->chain always represents the actual hardlink and not
1835          * any of the pointers to the actual hardlink).
1836          */
1837         error = hammer2_inode_connect(&trans, 1,
1838                                       dip, &chain,
1839                                       name, name_len);
1840         if (error == 0) {
1841                 cache_setunresolved(ap->a_nch);
1842                 cache_setvp(ap->a_nch, ap->a_vp);
1843         }
1844 done:
1845         if (chain)
1846                 hammer2_chain_unlock(chain);
1847         hammer2_inode_unlock_ex(ip);
1848         hammer2_trans_done(&trans);
1849
1850         return error;
1851 }
1852
1853 /*
1854  * hammer2_vop_ncreate { nch, dvp, vpp, cred, vap }
1855  *
1856  * The operating system has already ensured that the directory entry
1857  * does not exist and done all appropriate namespace locking.
1858  */
1859 static
1860 int
1861 hammer2_vop_ncreate(struct vop_ncreate_args *ap)
1862 {
1863         hammer2_mount_t *hmp;
1864         hammer2_inode_t *dip;
1865         hammer2_inode_t *nip;
1866         hammer2_trans_t trans;
1867         struct namecache *ncp;
1868         const uint8_t *name;
1869         size_t name_len;
1870         int error;
1871
1872         dip = VTOI(ap->a_dvp);
1873         hmp = dip->hmp;
1874         if (hmp->ronly)
1875                 return (EROFS);
1876
1877         ncp = ap->a_nch->ncp;
1878         name = ncp->nc_name;
1879         name_len = ncp->nc_nlen;
1880         hammer2_trans_init(&trans, hmp);
1881
1882         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1883                                    name, name_len, &error);
1884         if (error) {
1885                 KKASSERT(nip == NULL);
1886                 *ap->a_vpp = NULL;
1887         } else {
1888                 *ap->a_vpp = hammer2_igetv(nip, &error);
1889                 hammer2_inode_unlock_ex(nip);
1890         }
1891         hammer2_trans_done(&trans);
1892
1893         if (error == 0) {
1894                 cache_setunresolved(ap->a_nch);
1895                 cache_setvp(ap->a_nch, *ap->a_vpp);
1896         }
1897         return error;
1898 }
1899
1900 /*
1901  * hammer2_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1902  */
1903 static
1904 int
1905 hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
1906 {
1907         hammer2_mount_t *hmp;
1908         hammer2_inode_t *dip;
1909         hammer2_inode_t *nip;
1910         hammer2_trans_t trans;
1911         struct namecache *ncp;
1912         const uint8_t *name;
1913         size_t name_len;
1914         int error;
1915
1916         dip = VTOI(ap->a_dvp);
1917         hmp = dip->hmp;
1918         if (hmp->ronly)
1919                 return (EROFS);
1920
1921         ncp = ap->a_nch->ncp;
1922         name = ncp->nc_name;
1923         name_len = ncp->nc_nlen;
1924         hammer2_trans_init(&trans, hmp);
1925
1926         ap->a_vap->va_type = VLNK;      /* enforce type */
1927
1928         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1929                                    name, name_len, &error);
1930         if (error) {
1931                 KKASSERT(nip == NULL);
1932                 *ap->a_vpp = NULL;
1933                 hammer2_trans_done(&trans);
1934                 return error;
1935         }
1936         *ap->a_vpp = hammer2_igetv(nip, &error);
1937
1938         /*
1939          * Build the softlink (~like file data) and finalize the namecache.
1940          */
1941         if (error == 0) {
1942                 size_t bytes;
1943                 struct uio auio;
1944                 struct iovec aiov;
1945                 hammer2_inode_data_t *nipdata;
1946
1947                 nipdata = &nip->chain->data->ipdata;
1948                 bytes = strlen(ap->a_target);
1949
1950                 if (bytes <= HAMMER2_EMBEDDED_BYTES) {
1951                         KKASSERT(nipdata->op_flags &
1952                                  HAMMER2_OPFLAG_DIRECTDATA);
1953                         bcopy(ap->a_target, nipdata->u.data, bytes);
1954                         nipdata->size = bytes;
1955                 } else {
1956                         bzero(&auio, sizeof(auio));
1957                         bzero(&aiov, sizeof(aiov));
1958                         auio.uio_iov = &aiov;
1959                         auio.uio_segflg = UIO_SYSSPACE;
1960                         auio.uio_rw = UIO_WRITE;
1961                         auio.uio_resid = bytes;
1962                         auio.uio_iovcnt = 1;
1963                         auio.uio_td = curthread;
1964                         aiov.iov_base = ap->a_target;
1965                         aiov.iov_len = bytes;
1966                         error = hammer2_write_file(nip, &trans,
1967                                                    &auio, IO_APPEND, 0);
1968                         nipdata = &nip->chain->data->ipdata; /* RELOAD */
1969                         /* XXX handle error */
1970                         error = 0;
1971                 }
1972         }
1973         hammer2_inode_unlock_ex(nip);
1974         hammer2_trans_done(&trans);
1975
1976         /*
1977          * Finalize namecache
1978          */
1979         if (error == 0) {
1980                 cache_setunresolved(ap->a_nch);
1981                 cache_setvp(ap->a_nch, *ap->a_vpp);
1982                 /* hammer2_knote(ap->a_dvp, NOTE_WRITE); */
1983         }
1984         return error;
1985 }
1986
1987 /*
1988  * hammer2_vop_nremove { nch, dvp, cred }
1989  */
1990 static
1991 int
1992 hammer2_vop_nremove(struct vop_nremove_args *ap)
1993 {
1994         hammer2_inode_t *dip;
1995         hammer2_mount_t *hmp;
1996         hammer2_trans_t trans;
1997         struct namecache *ncp;
1998         const uint8_t *name;
1999         size_t name_len;
2000         int error;
2001
2002         dip = VTOI(ap->a_dvp);
2003         hmp = dip->hmp;
2004         if (hmp->ronly)
2005                 return(EROFS);
2006
2007         ncp = ap->a_nch->ncp;
2008         name = ncp->nc_name;
2009         name_len = ncp->nc_nlen;
2010         hammer2_trans_init(&trans, hmp);
2011         error = hammer2_unlink_file(&trans, dip, name, name_len, 0, NULL);
2012         hammer2_trans_done(&trans);
2013         if (error == 0) {
2014                 cache_unlink(ap->a_nch);
2015         }
2016         return (error);
2017 }
2018
2019 /*
2020  * hammer2_vop_nrmdir { nch, dvp, cred }
2021  */
2022 static
2023 int
2024 hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
2025 {
2026         hammer2_inode_t *dip;
2027         hammer2_mount_t *hmp;
2028         hammer2_trans_t trans;
2029         struct namecache *ncp;
2030         const uint8_t *name;
2031         size_t name_len;
2032         int error;
2033
2034         dip = VTOI(ap->a_dvp);
2035         hmp = dip->hmp;
2036         if (hmp->ronly)
2037                 return(EROFS);
2038
2039         ncp = ap->a_nch->ncp;
2040         name = ncp->nc_name;
2041         name_len = ncp->nc_nlen;
2042
2043         hammer2_trans_init(&trans, hmp);
2044         error = hammer2_unlink_file(&trans, dip, name, name_len, 1, NULL);
2045         hammer2_trans_done(&trans);
2046         if (error == 0) {
2047                 cache_unlink(ap->a_nch);
2048         }
2049         return (error);
2050 }
2051
2052 /*
2053  * hammer2_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
2054  */
2055 static
2056 int
2057 hammer2_vop_nrename(struct vop_nrename_args *ap)
2058 {
2059         struct namecache *fncp;
2060         struct namecache *tncp;
2061         hammer2_inode_t *fdip;
2062         hammer2_inode_t *tdip;
2063         hammer2_inode_t *ip;
2064         hammer2_chain_t *chain;
2065         hammer2_mount_t *hmp;
2066         hammer2_trans_t trans;
2067         const uint8_t *fname;
2068         size_t fname_len;
2069         const uint8_t *tname;
2070         size_t tname_len;
2071         int error;
2072         int hlink;
2073
2074         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
2075                 return(EXDEV);
2076         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
2077                 return(EXDEV);
2078
2079         fdip = VTOI(ap->a_fdvp);        /* source directory */
2080         tdip = VTOI(ap->a_tdvp);        /* target directory */
2081
2082         hmp = fdip->hmp;                /* check read-only filesystem */
2083         if (hmp->ronly)
2084                 return(EROFS);
2085
2086         fncp = ap->a_fnch->ncp;         /* entry name in source */
2087         fname = fncp->nc_name;
2088         fname_len = fncp->nc_nlen;
2089
2090         tncp = ap->a_tnch->ncp;         /* entry name in target */
2091         tname = tncp->nc_name;
2092         tname_len = tncp->nc_nlen;
2093
2094         hammer2_trans_init(&trans, hmp);
2095
2096         /*
2097          * ip is the inode being renamed.  If this is a hardlink then
2098          * ip represents the actual file and not the hardlink marker.
2099          */
2100         ip = VTOI(fncp->nc_vp);
2101         chain = NULL;
2102
2103         /*
2104          * Keep a tight grip on the inode so the temporary unlinking from
2105          * the source location prior to linking to the target location
2106          * does not cause the chain to be destroyed.
2107          *
2108          * NOTE: To avoid deadlocks we cannot lock (ip) while we are
2109          *       unlinking elements from their directories.  Locking
2110          *       the nlinks field does not lock the whole inode.
2111          */
2112         hammer2_inode_ref(ip);
2113
2114         /*
2115          * Remove target if it exists
2116          */
2117         error = hammer2_unlink_file(&trans, tdip, tname, tname_len, -1, NULL);
2118         if (error && error != ENOENT)
2119                 goto done;
2120         cache_setunresolved(ap->a_tnch);
2121
2122         /*
2123          * When renaming a hardlinked file we may have to re-consolidate
2124          * the location of the hardlink target.  Since the element is simply
2125          * being moved, nlinks is not modified in this case.
2126          *
2127          * If ip represents a regular file the consolidation code essentially
2128          * does nothing other than return the locked chain.
2129          *
2130          * The returned chain will be locked.
2131          *
2132          * WARNING!  We do not currently have a local copy of ipdata but
2133          *           we do use one later remember that it must be reloaded
2134          *           on any modification to the inode, including connects.
2135          */
2136         hammer2_inode_lock_ex(ip);
2137         error = hammer2_hardlink_consolidate(&trans, ip, &chain, tdip, 0);
2138         if (error)
2139                 goto done;
2140
2141         /*
2142          * Disconnect (fdip, fname) from the source directory.  This will
2143          * disconnect (ip) if it represents a direct file.  If (ip) represents
2144          * a hardlink the HARDLINK pointer object will be removed but the
2145          * hardlink will stay intact.
2146          *
2147          * The target chain may be marked DELETED but will not be destroyed
2148          * since we retain our hold on ip and chain.
2149          */
2150         error = hammer2_unlink_file(&trans, fdip, fname, fname_len, -1, &hlink);
2151         KKASSERT(error != EAGAIN);
2152         if (error)
2153                 goto done;
2154
2155         /*
2156          * Reconnect ip to target directory using chain.  Chains cannot
2157          * actually be moved, so this will duplicate the chain in the new
2158          * spot and assign it to the ip, replacing the old chain.
2159          *
2160          * WARNING: chain locks can lock buffer cache buffers, to avoid
2161          *          deadlocks we want to unlock before issuing a cache_*()
2162          *          op (that might have to lock a vnode).
2163          */
2164         error = hammer2_inode_connect(&trans, hlink,
2165                                       tdip, &chain,
2166                                       tname, tname_len);
2167         if (error == 0) {
2168                 KKASSERT(chain != NULL);
2169                 hammer2_inode_repoint(ip, (hlink ? ip->pip : tdip), chain);
2170                 cache_rename(ap->a_fnch, ap->a_tnch);
2171         }
2172 done:
2173         if (chain)
2174                 hammer2_chain_unlock(chain);
2175         hammer2_inode_unlock_ex(ip);
2176         hammer2_inode_drop(ip);
2177         hammer2_trans_done(&trans);
2178
2179         return (error);
2180 }
2181
2182 static int hammer2_strategy_read(struct vop_strategy_args *ap);
2183 static int hammer2_strategy_write(struct vop_strategy_args *ap);
2184
2185 static
2186 int
2187 hammer2_vop_strategy(struct vop_strategy_args *ap)
2188 {
2189         struct bio *biop;
2190         struct buf *bp;
2191         int error;
2192
2193         biop = ap->a_bio;
2194         bp = biop->bio_buf;
2195
2196         switch(bp->b_cmd) {
2197         case BUF_CMD_READ:
2198                 error = hammer2_strategy_read(ap);
2199                 ++hammer2_iod_file_read;
2200                 break;
2201         case BUF_CMD_WRITE:
2202                 error = hammer2_strategy_write(ap);
2203                 ++hammer2_iod_file_write;
2204                 break;
2205         default:
2206                 bp->b_error = error = EINVAL;
2207                 bp->b_flags |= B_ERROR;
2208                 biodone(biop);
2209                 break;
2210         }
2211
2212         return (error);
2213 }
2214
2215 static
2216 int
2217 hammer2_strategy_read(struct vop_strategy_args *ap)
2218 {
2219         struct buf *bp;
2220         struct bio *bio;
2221         struct bio *nbio;
2222         hammer2_mount_t *hmp;
2223         hammer2_inode_t *ip;
2224         hammer2_chain_t *parent;
2225         hammer2_chain_t *chain;
2226         hammer2_key_t lbase;
2227
2228         bio = ap->a_bio;
2229         bp = bio->bio_buf;
2230         ip = VTOI(ap->a_vp);
2231         hmp = ip->hmp;
2232         nbio = push_bio(bio);
2233
2234         lbase = bio->bio_offset;
2235         chain = NULL;
2236         KKASSERT(((int)lbase & HAMMER2_PBUFMASK) == 0);
2237
2238         /*
2239          * We must characterize the logical->physical translation if it
2240          * has not already been cached.
2241          *
2242          * Physical data references < LBUFSIZE are never cached.  This
2243          * includes both small-block allocations and inode-embedded data.
2244          */
2245         if (nbio->bio_offset == NOOFFSET) {
2246                 hammer2_inode_lock_sh(ip);
2247
2248                 parent = hammer2_chain_lookup_init(ip->chain,
2249                                                    HAMMER2_LOOKUP_SHARED);
2250
2251                 chain = hammer2_chain_lookup(&parent, lbase, lbase,
2252                                              HAMMER2_LOOKUP_NODATA |
2253                                              HAMMER2_LOOKUP_SHARED);
2254                 if (chain == NULL) {
2255                         /*
2256                          * Data is zero-fill
2257                          */
2258                         nbio->bio_offset = ZFOFFSET;
2259                 } else if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2260                         /*
2261                          * Data is embedded in the inode (do nothing)
2262                          */
2263                         KKASSERT(chain == parent);
2264                         hammer2_chain_unlock(chain);
2265                 } else if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
2266                         /*
2267                          * Data is on-media
2268                          */
2269                         KKASSERT(bp->b_bcount == chain->bytes);
2270                         nbio->bio_offset = chain->bref.data_off &
2271                                            HAMMER2_OFF_MASK;
2272                         hammer2_chain_unlock(chain);
2273                         KKASSERT(nbio->bio_offset != 0);
2274                 } else {
2275                         panic("hammer2_strategy_read: unknown bref type");
2276                 }
2277                 hammer2_chain_lookup_done(parent);
2278                 hammer2_inode_unlock_sh(ip);
2279         }
2280
2281         if (hammer2_debug & 0x0020) {
2282                 kprintf("read %016jx %016jx\n",
2283                         bio->bio_offset, nbio->bio_offset);
2284         }
2285
2286         if (nbio->bio_offset == ZFOFFSET) {
2287                 /*
2288                  * Data is zero-fill
2289                  */
2290                 bp->b_resid = 0;
2291                 bp->b_error = 0;
2292                 bzero(bp->b_data, bp->b_bcount);
2293                 biodone(nbio);
2294         } else if (nbio->bio_offset != NOOFFSET) {
2295                 /*
2296                  * Forward direct IO to the device
2297                  */
2298                 vn_strategy(hmp->devvp, nbio);
2299         } else {
2300                 /*
2301                  * Data is embedded in inode.
2302                  */
2303                 bcopy(chain->data->ipdata.u.data, bp->b_data,
2304                       HAMMER2_EMBEDDED_BYTES);
2305                 bzero(bp->b_data + HAMMER2_EMBEDDED_BYTES,
2306                       bp->b_bcount - HAMMER2_EMBEDDED_BYTES);
2307                 bp->b_resid = 0;
2308                 bp->b_error = 0;
2309                 biodone(nbio);
2310         }
2311         return (0);
2312 }
2313
2314 static
2315 int
2316 hammer2_strategy_write(struct vop_strategy_args *ap)
2317 {
2318         struct buf *bp;
2319         struct bio *bio;
2320         struct bio *nbio;
2321         hammer2_mount_t *hmp;
2322         hammer2_inode_t *ip;
2323
2324         bio = ap->a_bio;
2325         bp = bio->bio_buf;
2326         ip = VTOI(ap->a_vp);
2327         hmp = ip->hmp;
2328         nbio = push_bio(bio);
2329
2330         KKASSERT((bio->bio_offset & HAMMER2_PBUFMASK64) == 0);
2331         KKASSERT(nbio->bio_offset != 0 && nbio->bio_offset != ZFOFFSET);
2332
2333         if (nbio->bio_offset == NOOFFSET) {
2334                 /*
2335                  * Must be embedded in the inode.
2336                  *
2337                  * Because the inode is dirty, the chain must exist whether
2338                  * the inode is locked or not. XXX
2339                  */
2340                 KKASSERT(bio->bio_offset == 0);
2341                 KKASSERT(ip->chain && ip->chain->data);
2342                 bcopy(bp->b_data, ip->chain->data->ipdata.u.data,
2343                       HAMMER2_EMBEDDED_BYTES);
2344                 bp->b_resid = 0;
2345                 bp->b_error = 0;
2346                 biodone(nbio);
2347
2348                 /*
2349                  * This special flag does not follow the normal MODIFY rules
2350                  * because we might deadlock on ip.  Instead we depend on
2351                  * VOP_FSYNC() to detect the case.
2352                  */
2353                 atomic_set_int(&ip->flags, HAMMER2_INODE_DIRTYEMBED);
2354         } else {
2355                 /*
2356                  * Forward direct IO to the device
2357                  */
2358                 vn_strategy(hmp->devvp, nbio);
2359         }
2360         return (0);
2361 }
2362
2363 /*
2364  * hammer2_vop_ioctl { vp, command, data, fflag, cred }
2365  */
2366 static
2367 int
2368 hammer2_vop_ioctl(struct vop_ioctl_args *ap)
2369 {
2370         hammer2_mount_t *hmp;
2371         hammer2_inode_t *ip;
2372         int error;
2373
2374         ip = VTOI(ap->a_vp);
2375         hmp = ip->hmp;
2376
2377         error = hammer2_ioctl(ip, ap->a_command, (void *)ap->a_data,
2378                               ap->a_fflag, ap->a_cred);
2379         return (error);
2380 }
2381
2382 static
2383 int 
2384 hammer2_vop_mountctl(struct vop_mountctl_args *ap)
2385 {
2386         struct mount *mp;
2387         hammer2_pfsmount_t *pmp;
2388         int rc;
2389
2390         switch (ap->a_op) {
2391         case (MOUNTCTL_SET_EXPORT):
2392                 mp = ap->a_head.a_ops->head.vv_mount;
2393                 pmp = MPTOPMP(mp);
2394
2395                 if (ap->a_ctllen != sizeof(struct export_args))
2396                         rc = (EINVAL);
2397                 else
2398                         rc = vfs_export(mp, &pmp->export,
2399                                         (const struct export_args *)ap->a_ctl);
2400                 break;
2401         default:
2402                 rc = vop_stdmountctl(ap);
2403                 break;
2404         }
2405         return (rc);
2406 }
2407
2408 struct vop_ops hammer2_vnode_vops = {
2409         .vop_default    = vop_defaultop,
2410         .vop_fsync      = hammer2_vop_fsync,
2411         .vop_getpages   = vop_stdgetpages,
2412         .vop_putpages   = vop_stdputpages,
2413         .vop_access     = hammer2_vop_access,
2414         .vop_advlock    = hammer2_vop_advlock,
2415         .vop_close      = hammer2_vop_close,
2416         .vop_nlink      = hammer2_vop_nlink,
2417         .vop_ncreate    = hammer2_vop_ncreate,
2418         .vop_nsymlink   = hammer2_vop_nsymlink,
2419         .vop_nremove    = hammer2_vop_nremove,
2420         .vop_nrmdir     = hammer2_vop_nrmdir,
2421         .vop_nrename    = hammer2_vop_nrename,
2422         .vop_getattr    = hammer2_vop_getattr,
2423         .vop_setattr    = hammer2_vop_setattr,
2424         .vop_readdir    = hammer2_vop_readdir,
2425         .vop_readlink   = hammer2_vop_readlink,
2426         .vop_getpages   = vop_stdgetpages,
2427         .vop_putpages   = vop_stdputpages,
2428         .vop_read       = hammer2_vop_read,
2429         .vop_write      = hammer2_vop_write,
2430         .vop_open       = hammer2_vop_open,
2431         .vop_inactive   = hammer2_vop_inactive,
2432         .vop_reclaim    = hammer2_vop_reclaim,
2433         .vop_nresolve   = hammer2_vop_nresolve,
2434         .vop_nlookupdotdot = hammer2_vop_nlookupdotdot,
2435         .vop_nmkdir     = hammer2_vop_nmkdir,
2436         .vop_ioctl      = hammer2_vop_ioctl,
2437         .vop_mountctl   = hammer2_vop_mountctl,
2438         .vop_bmap       = hammer2_vop_bmap,
2439         .vop_strategy   = hammer2_vop_strategy,
2440 };
2441
2442 struct vop_ops hammer2_spec_vops = {
2443
2444 };
2445
2446 struct vop_ops hammer2_fifo_vops = {
2447
2448 };